knitr::opts_chunk$set(eval=FALSE)
library(dplyr)
library(magrittr)
library(purrr)
library(survival)
library(gtools)
library(stringr)
library(tidyverse)
library(survminer)
library(ggsurvfit)
library(pheatmap)
source("~/MRes_project_1/codes/5_plots/script/function.R")
ggsurvplot_customised <- function (fit_function, dataframe, pval_coord, break_time_by, title,
subtitle, legend_labs, p.val_method) {
p <- ggsurvplot(fit = fit_function, data = dataframe,
risk.table = TRUE, risk.table.height = 0.2, risk.table.y.text.col = TRUE,
risk.table.y.text = FALSE,
pval = TRUE, pval.method = FALSE, pval.coord = pval_coord, pval.size = 3,
log.rank.weights = p.val_method,
conf.int = TRUE, conf.int.style = "ribbon",
ncensor.plot = TRUE, ncensor.plot.height = 0.2, censor.size = 1,
censor.shape = 124,
legend.title = "status", legend.labs = legend_labs, legend = "top",
xlab = "Days", break.time.by = break_time_by,
surv.median.line = "hv",
fontsize = 3, font.family = "Arial", font.legend = c(9), size = 0.5,
ggtheme = theme_light(),
title = title, subtitle = subtitle,
surv.scale = "percent")
p <- customize_labels(p, font.title = c(9, "bold"),
font.subtitle = c(9, "italic", "darkgrey"), font.x = c(9),
font.y = c(9), font.xtickslab = c(8))
p$plot <- p$plot + geom_vline(xintercept = c(1826), size = 0.3, alpha = 0.5)
return(p)
}
if(!file.exists("~/MRes_project_1/docs/GISTIC2/lung_arm_complete.txt")){
tcga <- read.table("~/MRes_project_1/docs/GISTIC2/tcga/lung/broad_values_by_arm.txt",
sep = "\t", header = T)
mskcc <- read.table("~/MRes_project_1/docs/GISTIC2/cbioportal/lung_2/broad_values_by_arm.txt",
sep = "\t", header = T)
hh <- read.table("~/MRes_project_1/docs/GISTIC2/hh/lung_2/broad_values_by_arm.txt",
sep = "\t", header = T)
lung_arm <- left_join(tcga, mskcc, by="Chromosome.Arm")
lung_arm <- left_join(lung_arm, hh, by="Chromosome.Arm")
write.table(lung_arm, "~/MRes_project_1/docs/GISTIC2/lung_arm_complete.txt",
sep = "\t", col.names = T, row.names = F)
} else {
lung_arm <- read.table("~/MRes_project_1/docs/GISTIC2/lung_arm_complete.txt",
sep = "\t", header = T)
}
lung_arm[1:10, 1:10]
All of these samples are from non-small cell lung cancer patients.
# if raw combined file not exist
if(!file.exists("~/MRes_project_1/docs/00_mitéra/clinical/raw/lung/tcga_combined_raw.txt")){
## load
setwd("~/MRes_project_1/docs/00_mitéra/clinical/raw/lung")
tcga_clinical_ucsc <- read.table("tcga_lung_clinical_ucsc.txt", header = T, sep = "\t")
tcga_survival_ucsc <- read.table("tcga_lung_survival_ucsc.txt", header = T, sep = "\t")
tcga_lusc_cbp <- read.table("lusc_tcga_clinical_data.tsv", header = T, sep = "\t")
tcga_luad_cbp <- read.table("luad_tcga_clinical_data.tsv", header = T, sep = "\t")
## rename
tcga_clinical_ucsc <- tcga_clinical_ucsc %>% dplyr::rename(sample = sampleID)
tcga_lusc_cbp <- tcga_lusc_cbp %>% dplyr::rename(sample = Sample.ID)
tcga_luad_cbp <- tcga_luad_cbp %>% dplyr::rename(sample = Sample.ID)
## join
tcga <- left_join(tcga_clinical_ucsc, tcga_survival_ucsc, by="sample")
cbp <- rbind(tcga_lusc_cbp[, intersect(colnames(tcga_lusc_cbp), colnames(tcga_luad_cbp))],
tcga_luad_cbp[, intersect(colnames(tcga_lusc_cbp), colnames(tcga_luad_cbp))])
tcga <- left_join(tcga, cbp, by="sample")
## save
write.table(tcga, "~/MRes_project_1/docs/00_mitéra/clinical/raw/lung/tcga_combined_raw.txt",
sep = "\t", col.names = T, row.names = F, quote = F)
## if processed combine file not exist
} else if(!file.exists("~/MRes_project_1/docs/00_mitéra/clinical/processed/lung/tcga_combined.txt")){
## read in
tcga <- read.table("~/MRes_project_1/docs/00_mitéra/clinical/raw/lung/tcga_combined_raw.txt", sep = "\t", header = T)
## remove useless columns by...
## 1. more than 70% missing information
keep <- tcga %>% select_if(~is.numeric(.) | is.character(.)) %>% map_lgl(~sum(is.na(.) | . == "")/length(.) < 0.7)
tcga <- tcga[keep]
## 2. column with same values across
keep <- function(x){
x <- x[!is.na(x) & x != ""] ## not empty string or NA values
length(unique(x)) > 1} ## more than 1 unique values exist in the column
tcga <- tcga %>% select(where(keep))
## 3. sample columns
rownames(tcga) <- tcga$sample ## store the samples as rownames
tcga <- tcga %>% select_if(~!is.character(.) | ## remove columns with TCGA beginning
(is.character(.) &&
!any(startsWith(na.omit(.), "TCGA"))))
tcga$sample <- rownames(tcga) ## restore the sample column
## 4. UUID columns
remove <- tcga %>% ## define uuid columns and extract their column names
select_if(~any(str_detect(., regex("^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$")))) %>%
colnames()
tcga <- tcga %>% dplyr::select(setdiff(colnames(tcga), remove))
## combine columns
tcga <- tcga %>% mutate(Diagnosis.Age = if_else(is.na(Diagnosis.Age),
age_at_initial_pathologic_diagnosis, Diagnosis.Age)) ## age
tcga$smoke_yr <- tcga$Stopped.Smoking.Year - tcga$Started.Smoking.Year
## now manually select the columns you wish to keep and rename them if necessary
tcga <- tcga %>%
select(
## personal information
sample, Diagnosis.Age, gender, patient_id,
## tumour information
pathologic_stage, pathologic_M, pathologic_N, pathologic_T, sample_type,
residual_tumor, Cancer.Type.Detailed, new_tumor_event_after_initial_treatment,
longest_dimension, intermediate_dimension, shortest_dimension, anatomic_neoplasm_subdivision,
histological_type, Prior.Cancer.Diagnosis.Occurence,
## life-style
number_pack_years_smoked, tobacco_smoking_history, smoke_yr,
## treatment
history_of_neoadjuvant_treatment, targeted_molecular_therapy, radiation_therapy,
primary_therapy_outcome_success, followup_treatment_success,
## genomic information
Mutation.Count, Fraction.Genome.Altered, TMB..nonsynonymous.,
X_PANCAN_miRNA_PANCAN, X_PANCAN_mutation_PANCAN,
## survival status
OS, OS.time, DSS, DSS.time, DFI, DFI.time, PFI, PFI.time
) %>%
rename(age = Diagnosis.Age,
sex = gender,
stage = pathologic_stage,
cancer_type = Cancer.Type.Detailed,
new_tumour = new_tumor_event_after_initial_treatment,
anatomy_coord = anatomic_neoplasm_subdivision,
pack_yr = number_pack_years_smoked,
tobacco = tobacco_smoking_history,
prior_cancer = Prior.Cancer.Diagnosis.Occurence,
neoadjuvant_therapy = history_of_neoadjuvant_treatment,
targeted_therapy = targeted_molecular_therapy,
primary_outcome = primary_therapy_outcome_success,
secondline_outcome = followup_treatment_success,
mutation_count = Mutation.Count,
FGA = Fraction.Genome.Altered,
TMB = TMB..nonsynonymous.,
miRNA_cluster = X_PANCAN_miRNA_PANCAN,
mutation_cluster = X_PANCAN_mutation_PANCAN)
## add rownames
rownames(tcga) <- gsub("-", ".", tcga$sample)
## add the copy numbers to it
tcga_cna <- read.table("~/MRes_project_1/docs/GISTIC2/tcga/lung/broad_values_by_arm.txt", ## read in
sep = "\t", header = T, row.names = 1) %>% t() %>% as.data.frame()
colnames(tcga_cna) <- paste0("chr", colnames(tcga_cna)) ## change rownames
tcga <- tcga[rownames(tcga_cna), ] ## align
tcga <- merge(tcga, tcga_cna, by="row.names")
tcga$Row.names <- NULL
## change all empty strings to NA values
tcga <- lapply(tcga, function(x) ifelse(x == "", NA, x))
tcga <- tcga %>% as.data.frame()
## selectively mutate some columns
tcga$sex <- ifelse(tcga$sex == "MALE", 1, 0) ## male = 1, female = 0
tcga$new_tumour <- ifelse(tcga$new_tumour == "YES", 1, 0) ## yes = 1, no = 0
tcga$prior_cancer <- ifelse(tcga$prior_cancer == "Yes", 1, 0) ## yes = 1, no = 0
tcga$neoadjuvant_therapy <- ifelse(tcga$neoadjuvant_therapy == "Yes", 1, 0) ## yes = 1, no = 0
tcga$targeted_therapy <- ifelse(tcga$targeted_therapy == "YES", 1, 0) ## yes = 1, no = 0
tcga$radiation_therapy <- ifelse(tcga$radiation_therapy == "YES", 1, 0) ## yes = 1, no = 0
tcga <- tcga %>% mutate(primary_outcome = case_when(
primary_outcome == "Complete Remission/Response" ~ 2L,
primary_outcome == "Partial Remission/Response" ~ 1L,
primary_outcome == "Stable Disease" ~ 0L,
primary_outcome == "Progressive Disease" ~ -1L,
TRUE ~ NA_integer_
))
tcga <- tcga %>% mutate(secondline_outcome = case_when(
secondline_outcome == "Complete Remission/Response" ~ 2L,
secondline_outcome == "Partial Remission/Response" ~ 1L,
secondline_outcome == "Stable Disease" ~ 0L,
secondline_outcome == "Progressive Disease" ~ -1L,
TRUE ~ NA_integer_
))
tcga$primary_response <- ifelse(tcga$primary_outcome > 0, 1, 0) ## if primary outcome is 1 or 2, then response = 1, else 0
tcga$secondline_response <- ifelse(tcga$secondline_outcome > 0, 1, 0) ## same logic as above
tcga$stage <- roman2int(gsub("Stage\\s|A|B|C|Discrepancy|\\[|\\]", "", tcga$stage))
tcga$pathologic_M <- gsub("M", "", tcga$pathologic_M) %>% as.numeric()
tcga$pathologic_N <- gsub("N", "", tcga$pathologic_N) %>% as.numeric()
tcga$pathologic_T <- gsub("T", "", tcga$pathologic_T) %>% as.numeric()
tcga$sample_type <- ifelse(tcga$sample_type == "Primary Tumor", 0, 1) ## primary tumour = 0, recurrent = 1
tcga$residual_tumor <- ifelse(tcga$residual_tumor == "RX", NA, tcga$residual_tumor)
tcga$residual_tumor <- gsub("R", "", tcga$residual_tumor) %>% as.numeric() ## R0, no residual; R1, microscopic; R2, macroscopic
tcga$cancer_type <- ifelse(tcga$cancer_type == "Lung Adenocarcinoma", 1, 0) ## 1 = LUAD, 0 = LUSC
tcga$miRNA_cluster <- gsub("miRNA cluster ", "", tcga$miRNA_cluster) %>% as.numeric()
tcga$mutation_cluster <- gsub("mutation cluster |mutation c1uster ", "", tcga$mutation_cluster) %>% as.numeric()
tcga$anatomy_coord <- gsub(" \\(please specify\\)|\\[Discrepancy\\]", "", tcga$anatomy_coord)
tcga$anatomy_coord <- ifelse(tcga$anatomy_coord == "", NA, tcga$anatomy_coord)
## except for character columns listed, make every other column numeric
character_cols <- c("sample", "anatomy_coord", "histological_type")
tcga[, setdiff(colnames(tcga), character_cols)] <- apply(tcga[, setdiff(colnames(tcga), character_cols)], 2, as.numeric)
## save table
write.table(tcga, "~/MRes_project_1/docs/00_mitéra/clinical/processed/lung/tcga_combined.txt",
sep = "\t", col.names = T, row.names = F, quote = F)
}
tcga <- read.table("~/MRes_project_1/docs/00_mitéra/clinical/processed/lung/tcga_combined.txt",
sep = "\t", header = TRUE)
rownames(tcga) <- tcga$sample
character_cols <- c("sample", "anatomy_coord", "histological_type")
survival_cols <- c("OS", "OS.time", "DSS", "DSS.time", "DFI", "DFI.time", "PFI", "PFI.time")
cna_cols <- colnames(tcga)[grep("^chr", colnames(tcga))]
numeric_cols <- setdiff(colnames(tcga), c(character_cols, survival_cols, cna_cols))
plot_list <- list()
plot_cols <- c("age", "stage", "longest_dimension", "intermediate_dimension", "shortest_dimension",
"pack_yr", "tobacco", "smoke_yr", "mutation_count", "FGA", "TMB")
for (i in plot_cols) {
p <- ggplot(tcga, aes_string(x = "as.factor(sample_type)", y = i)) +
geom_violin(trim = FALSE) +
ggtitle(paste(i)) + xlab("sample type") +
theme(title = element_text(size = 8))
plot_list[[i]] <- p
}
plot_grid <- wrap_plots(plot_list, ncol = floor(sqrt(length(plot_cols))))
plot_grid
given a list of numeric columns numeric_cols perform
univaraite analysis of all these columns against patient survival given
as this formula:
surv_obj <- Surv(time = tcga$OS.time, event = tcga$OS)
and store the result in a dataframe called univarate with 5
columns (column name = HR, upper_95, lower_95, p_value, FDR) and
rownames being names from numeric_cols.
## univariate analysis on overall survival
univariate <- matrix(NA, nrow = length(numeric_cols), ncol = 5) %>% as.data.frame()
colnames(univariate) <- c("HR", "upper_95", "lower_95", "p_value", "FDR")
rownames(univariate) <- numeric_cols
for(i in numeric_cols){
test <- coxph(Surv(time=OS.time, event=OS)~tcga[,i], data = tcga)
test <- summary(test)
univariate[i, "HR"] <- test$coefficients[1, 2]
univariate[i, "p_value"] <- test$coefficients[1, 5]
univariate[i, "upper_95"] <- test$conf.int[1, 4]
univariate[i, "lower_95"] <- test$conf.int[1, 3]
}
univariate$FDR <- p.adjust(univariate$p_value, method = "fdr")
univariate$FDR <- round(univariate$FDR, digits = 4)
given a list of numeric columns cna_cols perform
multivariate analysis of all these columns against patient survival
given as this formula:
surv_obj <- Surv(time = tcga$OS.time, event = tcga$OS)
with age, stage, and sex as
initial covariates and store the result in a dataframe called
multivariate_1 with 5 columns (column name = HR, upper_95,
lower_95, p_value, FDR) and rownames being names from
cna_cols.
## multivariate analysis on overall survival
multivariate_1 <- matrix(NA, nrow = length(cna_cols), ncol = 5) %>% as.data.frame()
colnames(multivariate_1) <- c("HR", "upper_95", "lower_95", "p_value", "FDR")
rownames(multivariate_1) <- cna_cols
for(i in cna_cols){
test <- coxph(Surv(time=OS.time, event=OS)~tcga[,i]+age+sex+stage, data = tcga)
test <- summary(test)
multivariate_1[i, "HR"] <- test$coefficients[1, 2]
multivariate_1[i, "p_value"] <- test$coefficients[1, 5]
multivariate_1[i, "upper_95"] <- test$conf.int[1, 4]
multivariate_1[i, "lower_95"] <- test$conf.int[1, 3]
}
multivariate_1$FDR <- p.adjust(multivariate_1$p_value, method = "fdr")
multivariate_1$FDR <- round(multivariate_1$FDR, digits = 4)
multivariate_1 <- multivariate_1 %>% dplyr::arrange(p_value)
Based on univariate analysis we identified the following factors as
significantly associated with survival (fdr<0.05). but they do not
necessary have biological meaning.
added residual_tumour, new_tumour, and
radiation_therapy as covariate, remove sex
from covariate
## multivariate analysis on overall survival
multivariate_2 <- matrix(NA, nrow = length(cna_cols), ncol = 5) %>% as.data.frame()
colnames(multivariate_2) <- c("HR", "upper_95", "lower_95", "p_value", "FDR")
rownames(multivariate_2) <- cna_cols
for(i in cna_cols){
test <- coxph(Surv(time=OS.time, event=OS)~tcga[,i]+age+stage+residual_tumor+new_tumour+radiation_therapy, data = tcga)
test <- summary(test)
multivariate_2[i, "HR"] <- test$coefficients[1, 2]
multivariate_2[i, "p_value"] <- test$coefficients[1, 5]
multivariate_2[i, "upper_95"] <- test$conf.int[1, 4]
multivariate_2[i, "lower_95"] <- test$conf.int[1, 3]
}
multivariate_2$FDR <- p.adjust(multivariate_2$p_value, method = "fdr")
multivariate_2$FDR <- round(multivariate_2$FDR, digits = 4)
multivariate_2 <- multivariate_2 %>% dplyr::arrange(p_value)
Maybe should we separate LUAD and LUSC?
if(!file.exists("~/MRes_project_1/docs/00_mitéra/clinical/processed/lung/tcga_luad.txt")){
tcga_luad <- tcga %>% filter(cancer_type == 1)
tcga_luad_msi <- read.table("~/MRes_project_1/docs/00_mitéra/clinical/raw/lung/luad_MSI.txt",
sep = "\t", header = T) %>%
dplyr::select(Sample.ID, MSI.MANTIS.Score) %>%
dplyr::rename(sample = Sample.ID, MSI = MSI.MANTIS.Score)
tcga_luad_aneuploidy <- read.table("~/MRes_project_1/docs/00_mitéra/clinical/raw/lung/luad_aneuploidy.txt",
sep = "\t", header = T) %>%
dplyr::select(Sample.ID, Aneuploidy.Score) %>%
dplyr::rename(sample = Sample.ID, aneuploidy = Aneuploidy.Score)
tcga_luad <- left_join(tcga_luad, tcga_luad_msi, by="sample")
tcga_luad <- left_join(tcga_luad, tcga_luad_aneuploidy, by="sample")
write.table(tcga_luad, file = "~/MRes_project_1/docs/00_mitéra/clinical/processed/lung/tcga_luad.txt",
sep = "\t", quote = F, col.names = T, row.names = F)
}
if(!file.exists("~/MRes_project_1/docs/00_mitéra/clinical/processed/lung/tcga_lusc.txt")){
tcga_lusc <- tcga %>% filter(cancer_type == 0)
tcga_lusc_msi <- read.table("~/MRes_project_1/docs/00_mitéra/clinical/raw/lung/lusc_MSI.txt",
sep = "\t", header = T) %>%
dplyr::select(Sample.ID, MSI.MANTIS.Score) %>%
dplyr::rename(sample = Sample.ID, MSI = MSI.MANTIS.Score)
tcga_lusc <- left_join(tcga_lusc, tcga_lusc_msi, by="sample")
write.table(tcga_lusc, file = "~/MRes_project_1/docs/00_mitéra/clinical/processed/lung/tcga_lusc.txt",
sep = "\t", quote = F, col.names = T, row.names = F)
}
Multivariate analysis for LUAD regressing out age,
stage, and sex.
tcga_luad <- read.table("~/MRes_project_1/docs/00_mitéra/clinical/processed/lung/tcga_luad.txt",
sep = "\t", header = T)
rownames(tcga_luad) <- tcga_luad$sample
## multivariate analysis on overall survival with luad
luad <- matrix(NA, nrow = length(cna_cols), ncol = 5) %>% as.data.frame()
colnames(luad) <- c("HR", "upper_95", "lower_95", "p_value", "FDR")
rownames(luad) <- cna_cols
for(i in cna_cols){
test <- coxph(Surv(time=OS.time, event=OS)~tcga_luad[,i]+age+stage+sex, data = tcga_luad)
test <- summary(test)
luad[i, "HR"] <- test$coefficients[1, 2]
luad[i, "p_value"] <- test$coefficients[1, 5]
luad[i, "upper_95"] <- test$conf.int[1, 4]
luad[i, "lower_95"] <- test$conf.int[1, 3]
}
luad$FDR <- p.adjust(luad$p_value, method = "fdr")
luad$FDR <- round(luad$FDR, digits = 4)
luad <- luad %>% dplyr::arrange(p_value)
tcga_lusc <- read.table("~/MRes_project_1/docs/00_mitéra/clinical/processed/lung/tcga_lusc.txt",
sep = "\t", header = T)
## multivariate analysis on overall survival with lusc
lusc <- matrix(NA, nrow = length(cna_cols), ncol = 5) %>% as.data.frame()
colnames(lusc) <- c("HR", "upper_95", "lower_95", "p_value", "FDR")
rownames(lusc) <- cna_cols
for(i in cna_cols){
test <- coxph(Surv(time=OS.time, event=OS)~tcga_lusc[,i]+age+stage+sex, data = tcga_lusc)
test <- summary(test)
lusc[i, "HR"] <- test$coefficients[1, 2]
lusc[i, "p_value"] <- test$coefficients[1, 5]
lusc[i, "upper_95"] <- test$conf.int[1, 4]
lusc[i, "lower_95"] <- test$conf.int[1, 3]
}
lusc$FDR <- p.adjust(lusc$p_value, method = "fdr")
lusc$FDR <- round(lusc$FDR, digits = 4)
lusc <- lusc %>% dplyr::arrange(p_value)
After experimenting, the top hit chromosomes are quite different in
the two datasets, let’s first focus on lung adenocarcinoma. Covariate
regresing out age, stage, sex,
pack_yr, tobacco, FGA,
mutation_count, and TMB. age,
stage, and sex are regressed out a s personal
information, pack_yr and tobacco are regressed
out because smoking is shown to correlated with lung cancer
tumorigenesis and survival advantage. FGA,
mutation_count and TMB are regressed out
because they are shown to have prognostic value in other studies.
First, let’s look at lung adenocarcinoma
## multivariate analysis on overall survival with luad
luad_2 <- matrix(NA, nrow = length(cna_cols), ncol = 5) %>% as.data.frame()
colnames(luad_2) <- c("HR", "upper_95", "lower_95", "p_value", "FDR")
rownames(luad_2) <- cna_cols
for(i in cna_cols){
test <- coxph(Surv(time=OS.time, event=OS)~tcga_luad[,i]+age+stage+sex+pack_yr+tobacco+FGA+mutation_count+TMB, data = tcga_luad)
test <- summary(test)
luad_2[i, "HR"] <- test$coefficients[1, 2]
luad_2[i, "p_value"] <- test$coefficients[1, 5]
luad_2[i, "upper_95"] <- test$conf.int[1, 4]
luad_2[i, "lower_95"] <- test$conf.int[1, 3]
}
luad_2$FDR <- p.adjust(luad_2$p_value, method = "fdr")
luad_2$FDR <- round(luad_2$FDR, digits = 4)
luad_2 <- luad_2 %>% dplyr::arrange(p_value)
Next look at lung squamous cell carcinoma.
## multivariate analysis on overall survival with luad
lusc_2 <- matrix(NA, nrow = length(cna_cols), ncol = 5) %>% as.data.frame()
colnames(lusc_2) <- c("HR", "upper_95", "lower_95", "p_value", "FDR")
rownames(lusc_2) <- cna_cols
for(i in cna_cols){
test <- coxph(Surv(time=OS.time, event=OS)~tcga_lusc[,i]+age+stage+sex+pack_yr+tobacco+FGA+mutation_count+TMB,
data = tcga_lusc)
test <- summary(test)
lusc_2[i, "HR"] <- test$coefficients[1, 2]
lusc_2[i, "p_value"] <- test$coefficients[1, 5]
lusc_2[i, "upper_95"] <- test$conf.int[1, 4]
lusc_2[i, "lower_95"] <- test$conf.int[1, 3]
}
lusc_2$FDR <- p.adjust(lusc_2$p_value, method = "fdr")
lusc_2$FDR <- round(lusc_2$FDR, digits = 4)
lusc_2 <- lusc_2 %>% dplyr::arrange(p_value)
Lastly, apply the covariates to complete lung cancer dataset.
## multivariate analysis on overall survival
multivariate_3 <- matrix(NA, nrow = length(cna_cols), ncol = 5) %>% as.data.frame()
colnames(multivariate_3) <- c("HR", "upper_95", "lower_95", "p_value", "FDR")
rownames(multivariate_3) <- cna_cols
for(i in cna_cols){
test <- coxph(Surv(time=OS.time, event=OS)~tcga[,i]+age+stage+sex+pack_yr+tobacco+FGA+mutation_count+TMB, data = tcga)
test <- summary(test)
multivariate_3[i, "HR"] <- test$coefficients[1, 2]
multivariate_3[i, "p_value"] <- test$coefficients[1, 5]
multivariate_3[i, "upper_95"] <- test$conf.int[1, 4]
multivariate_3[i, "lower_95"] <- test$conf.int[1, 3]
}
multivariate_3$FDR <- p.adjust(multivariate_3$p_value, method = "fdr")
multivariate_3$FDR <- round(multivariate_3$FDR, digits = 4)
multivariate_3 <- multivariate_3 %>% dplyr::arrange(p_value)
In all lung cancer
## in all lung cancer samples
cna_status <- matrix(NA, nrow = 1, ncol = 3)
colnames(cna_status) <- c("amp", "del", "wt")
for(i in cna_cols){
table <- ifelse(tcga[i]>0, "amp", ifelse(tcga[i] < 0, "del", "wt")) %>% table() %>% t()
cna_status <- rbind(cna_status, table)
}
cna_status <- cna_status[2:nrow(cna_status), ] %>% as.data.frame()
rownames(cna_status) <- cna_cols
Row_Sums <- rowSums(cna_status)
cna_status <- sweep(x = cna_status, MARGIN = 1, STATS = Row_Sums, FUN = "/") ## calculate fraction
cna_status <- cna_status * 100 ## into percentage
cna_status$chr_arms <- cna_cols
## stack bar plot
cna_status_long <- cna_status %>% pivot_longer(cols = amp:wt, names_to = "Mutation", values_to = "Percentage")
# Create a stacked bar plot
p1 <- ggplot(cna_status_long, aes(x = chr_arms, y = Percentage, fill = Mutation)) +
geom_bar(stat = "identity") + coord_flip()+
labs(x = "chromosome arms", y = "Percentage") + theme_bw() +
theme(axis.text.x = element_text(angle = 90, hjust = 1),
title = element_text(size = 8)) +
scale_fill_brewer(palette = "Set3") +
ggtitle("chr_arm status in TCGA lung cancer")
In lung adenocarcinoma
## in all lung cancer samples
cna_status <- matrix(NA, nrow = 1, ncol = 3)
colnames(cna_status) <- c("amp", "del", "wt")
for(i in cna_cols){
table <- ifelse(tcga_luad[i]>0, "amp", ifelse(tcga_luad[i] < 0, "del", "wt")) %>% table() %>% t()
cna_status <- rbind(cna_status, table)
}
cna_status <- cna_status[2:nrow(cna_status), ] %>% as.data.frame()
rownames(cna_status) <- cna_cols
Row_Sums <- rowSums(cna_status)
cna_status <- sweep(x = cna_status, MARGIN = 1, STATS = Row_Sums, FUN = "/") ## calculate fraction
cna_status <- cna_status * 100 ## into percentage
cna_status$chr_arms <- cna_cols
## stack bar plot
cna_status_long <- cna_status %>% pivot_longer(cols = amp:wt, names_to = "Mutation", values_to = "Percentage")
# Create a stacked bar plot
p2 <- ggplot(cna_status_long, aes(x = chr_arms, y = Percentage, fill = Mutation)) +
geom_bar(stat = "identity") + coord_flip()+
labs(x = "chromosome arms", y = "Percentage") + theme_bw() +
theme(axis.text.x = element_text(angle = 90, hjust = 1),
title = element_text(size = 8)) +
scale_fill_brewer(palette = "Set3") +
ggtitle("chr_arm status in TCGA LUAD")
In lung squamous cell carcinoma
## in all lung cancer samples
cna_status <- matrix(NA, nrow = 1, ncol = 3)
colnames(cna_status) <- c("amp", "del", "wt")
for(i in cna_cols){
table <- ifelse(tcga_lusc[i]>0, "amp", ifelse(tcga_lusc[i] < 0, "del", "wt")) %>% table() %>% t()
cna_status <- rbind(cna_status, table)
}
cna_status <- cna_status[2:nrow(cna_status), ] %>% as.data.frame()
rownames(cna_status) <- cna_cols
Row_Sums <- rowSums(cna_status)
cna_status <- sweep(x = cna_status, MARGIN = 1, STATS = Row_Sums, FUN = "/") ## calculate fraction
cna_status <- cna_status * 100 ## into percentage
cna_status$chr_arms <- cna_cols
## stack bar plot
cna_status_long <- cna_status %>% pivot_longer(cols = amp:wt, names_to = "Mutation", values_to = "Percentage")
# Create a stacked bar plot
p3 <- ggplot(cna_status_long, aes(x = chr_arms, y = Percentage, fill = Mutation)) +
geom_bar(stat = "identity") + coord_flip()+
labs(x = "chromosome arms", y = "Percentage") + theme_bw() +
theme(axis.text.x = element_text(angle = 90, hjust = 1),
title = element_text(size = 8)) +
scale_fill_brewer(palette = "Set3") +
ggtitle("chr_arm status in TCGA LUSC")
multivariate_3$Index <- factor(rownames(multivariate_3), levels = rownames(multivariate_3))
p4 <- ggplot(multivariate_3, aes(x = Index, y = HR)) +
geom_point() +
geom_errorbar(aes(ymin = lower_95, ymax = upper_95), width = 0.2) +
coord_flip() + # Flip coordinates to make it a traditional forest plot layout
theme_bw() +
labs(y = "log10 Hazard Ratio (HR)", x = "Chromosome Arms") +
geom_hline(yintercept = 1) +
ggtitle("TCGA Lung Cancer") + ylim(floor(min(multivariate_3$lower_95)), ceiling(max(multivariate_3$upper_95))) +
theme(title = element_text(size = 9))
luad_2$Index <- factor(rownames(luad_2), levels = rownames(luad_2))
p5 <- ggplot(luad_2, aes(x = Index, y = HR)) +
geom_point() +
geom_errorbar(aes(ymin = lower_95, ymax = upper_95), width = 0.2) +
coord_flip() + # Flip coordinates to make it a traditional forest plot layout
theme_bw() +
labs(y = "log10 Hazard Ratio (HR)", x = "Chromosome Arms") +
geom_hline(yintercept = 1) +
ggtitle("TCGA LUAD") + ylim(floor(min(luad_2$lower_95)), ceiling(max(luad_2$upper_95))) +
theme(title = element_text(size = 9))
lusc_2$Index <- factor(rownames(lusc_2), levels = rownames(lusc_2))
p6 <- ggplot(lusc_2, aes(x = Index, y = HR)) +
geom_point() +
geom_errorbar(aes(ymin = lower_95, ymax = upper_95), width = 0.2) +
coord_flip() + # Flip coordinates to make it a traditional forest plot layout
theme_bw() +
labs(y = "log10 Hazard Ratio (HR)", x = "Chromosome Arms") +
geom_hline(yintercept = 1) +
ggtitle("TCGA LUSC") + ylim(floor(min(lusc_2$lower_95)), ceiling(max(lusc_2$upper_95))) +
theme(title = element_text(size = 9))
tcga_luad$chr15qStatus <- ifelse(tcga_luad$chr15q > 0, "amp", "wt")
tcga_luad$chr22qStatus <- ifelse(tcga_luad$chr22q > 0, "amp", "wt")
fit <- survfit(Surv(OS.time, OS) ~ as.factor(chr22qStatus), data = tcga_luad)
p7 <- ggsurvplot_customised(fit_function = fit, dataframe = tcga_luad, pval_coord = c(3500, 0.75),
break_time_by = 1000, legend_labs = c("amp", "wt"), p.val_method = "1",
title = "Kaplan-Meier Curve for TCGA LUAD",
subtitle = "survival = OS, subtype = LUAD, n = 502, chr_arm = 22q")
fit <- survfit(Surv(OS.time, OS) ~ as.factor(chr15qStatus), data = tcga_luad)
p8 <- ggsurvplot_customised(fit_function = fit, dataframe = tcga_luad, pval_coord = c(3500, 0.75),
break_time_by = 1000, legend_labs = c("amp", "wt"), p.val_method = "1",
title = "Kaplan-Meier Curve for TCGA LUAD",
subtitle = "survival = OS, subtype = LUAD, n = 502, chr_arm = 15q")
tcga_lusc$chr22qStatus <- ifelse(tcga_lusc$chr22q < 0, "del", "wt")
tcga_lusc$chr15qStatus <- ifelse(tcga_lusc$chr15q < 0, "del", "wt")
fit <- survfit(Surv(OS.time, OS) ~ as.factor(chr22qStatus), data = tcga_lusc)
p9 <- ggsurvplot_customised(fit_function = fit, dataframe = tcga_lusc, pval_coord = c(3500, 0.75),
break_time_by = 500, legend_labs = c("del", "wt"), p.val_method = "1",
title = "Kaplan-Meier Curve for TCGA LUSC",
subtitle = "survival = OS, subtype = LUSC, n = 487, chr_arm = 22q")
fit <- survfit(Surv(OS.time, OS) ~ as.factor(chr15qStatus), data = tcga_lusc)
p10 <- ggsurvplot_customised(fit_function = fit, dataframe = tcga_lusc, pval_coord = c(3500, 0.75),
break_time_by = 500, legend_labs = c("del", "wt"), p.val_method = "1",
title = "Kaplan-Meier Curve for TCGA LUSC",
subtitle = "survival = OS, subtype = LUSC, n = 487, chr_arm = 15q")
tcga$chr4pStatus <- ifelse(tcga$chr4p > 0, "amp", "wt")
fit <- survfit(Surv(OS.time, OS) ~ as.factor(chr4pStatus), data = tcga)
p11 <- ggsurvplot_customised(fit_function = fit, dataframe = tcga, pval_coord = c(3500, 0.75),
break_time_by = 500, legend_labs = c("amp", "wt"), p.val_method = "1",
title = "Kaplan-Meier Curve for TCGA LUSC+LUAD",
subtitle = "survival = OS, subtype = LUSC+LUAD, n = 989, chr_arm = 4p")
p12 <- ggplot(tcga_luad, aes(x = as.factor(primary_outcome), y = chr15q))+geom_violin()+ggtitle("primary therapy, LUAD, 15q")
p13 <- ggplot(tcga_luad, aes(x = as.factor(primary_outcome), y = chr22q))+geom_violin()+ggtitle("primary therapy, LUAD, 22q")
p14 <- ggplot(tcga_luad, aes(x = as.factor(secondline_outcome), y = chr15q))+geom_violin()+ggtitle("secondline therapy, LUAD, 15q")
p15 <- ggplot(tcga_luad, aes(x = as.factor(secondline_outcome), y = chr22q))+geom_violin()+ggtitle("secondline therapy, LUAD, 22q")
aov(chr15q ~ as.factor(primary_outcome), data = tcga_luad) %>% summary()
aov(chr22q ~ as.factor(primary_outcome), data = tcga_luad) %>% summary()
aov(chr15q ~ as.factor(secondline_outcome), data = tcga_luad) %>% summary()
aov(chr22q ~ as.factor(secondline_outcome), data = tcga_luad) %>% summary()
p16 <- ggplot(tcga_luad, aes(x = as.factor(primary_response), y = chr15q))+geom_violin()+ggtitle("primary response, LUAD, 15q")
p17 <- ggplot(tcga_luad, aes(x = as.factor(primary_response), y = chr22q))+geom_violin()+ggtitle("primary response, LUAD, 22q")
p18 <- ggplot(tcga_luad, aes(x = as.factor(secondline_response), y = chr15q))+geom_violin()+ggtitle("secondline response, LUAD, 15q")
p19 <- ggplot(tcga_luad, aes(x = as.factor(secondline_response), y = chr22q))+geom_violin()+ggtitle("secondline response, LUAD, 22q")
wilcox.test(tcga_luad$chr15q[tcga_luad$primary_response == 1], tcga_luad$chr15q[tcga_luad$primary_response == 0])
wilcox.test(tcga_luad$chr22q[tcga_luad$primary_response == 1], tcga_luad$chr22q[tcga_luad$primary_response == 0])
wilcox.test(tcga_luad$chr15q[tcga_luad$secondline_response == 1], tcga_luad$chr15q[tcga_luad$secondline_response == 0])
wilcox.test(tcga_luad$chr22q[tcga_luad$secondline_response == 1], tcga_luad$chr22q[tcga_luad$secondline_response == 0])
p20 <- ggplot(tcga_lusc, aes(x = as.factor(primary_outcome), y = chr15q))+geom_violin()+ggtitle("primary therapy, LUSC, 15q")
p21 <- ggplot(tcga_lusc, aes(x = as.factor(primary_outcome), y = chr22q))+geom_violin()+ggtitle("primary therapy, LUSC, 22q")
p22 <- ggplot(tcga_lusc, aes(x = as.factor(secondline_outcome), y = chr15q))+geom_violin()+ggtitle("secondline therapy, LUSC, 15q")
p23 <- ggplot(tcga_lusc, aes(x = as.factor(secondline_outcome), y = chr22q))+geom_violin()+ggtitle("secondline therapy, LUSC, 22q")
aov(chr15q ~ as.factor(primary_outcome), data = tcga_lusc) %>% summary()
aov(chr22q ~ as.factor(primary_outcome), data = tcga_lusc) %>% summary()
aov(chr15q ~ as.factor(secondline_outcome), data = tcga_lusc) %>% summary()
aov(chr22q ~ as.factor(secondline_outcome), data = tcga_lusc) %>% summary()
p24 <- ggplot(tcga_lusc, aes(x = as.factor(primary_response), y = chr15q))+geom_violin()+ggtitle("primary response, LUSC, 15q")
p25 <- ggplot(tcga_lusc, aes(x = as.factor(primary_response), y = chr22q))+geom_violin()+ggtitle("primary response, LUSC, 22q")
p26 <- ggplot(tcga_lusc, aes(x = as.factor(secondline_response), y = chr15q))+geom_violin()+ggtitle("secondline response, LUSC, 15q")
p27 <- ggplot(tcga_lusc, aes(x = as.factor(secondline_response), y = chr22q))+geom_violin()+ggtitle("secondline response, LUSC, 22q")
wilcox.test(tcga_lusc$chr15q[tcga_lusc$primary_response == 1], tcga_lusc$chr15q[tcga_lusc$primary_response == 0])
wilcox.test(tcga_lusc$chr22q[tcga_lusc$primary_response == 1], tcga_lusc$chr22q[tcga_lusc$primary_response == 0])
wilcox.test(tcga_lusc$chr15q[tcga_lusc$secondline_response == 1], tcga_lusc$chr15q[tcga_lusc$secondline_response == 0])
wilcox.test(tcga_lusc$chr22q[tcga_lusc$secondline_response == 1], tcga_lusc$chr22q[tcga_lusc$secondline_response == 0])