E-GEOD-5847-nonIBC.RMd

Stromal vs Epithelial within nonIBC patients

Up regulated genes will be in reference to genes highly expressed within the TUMOUR EPITHELIUM SAMPLES.

library(Biobase)
library(oligoClasses)
library(ArrayExpress)
library(affycoretools)
#library(pd.hugene.1.0.st.v1)A
#library(hugene10sttranscriptcluster.db)
library(u133x3p.db) 
library(oligo)
library(limma)
library(topGO)
library(clusterProfiler)
library(gplots)
library(ggplot2)
library(ggpubr)
library(geneplotter)
library(RColorBrewer)
library(pheatmap)
library(enrichplot)
library(dplyr)
library(tidyr)
library(stringr)
library(matrixStats)
library(genefilter)
library(openxlsx)

raw_data_dir <- "/data/projects/Laura_Barkley/E-GEOD-5847"
sdrf_location <- file.path(raw_data_dir, "E-GEOD-5847.sdrf.txt")
SDRF <- read.delim(sdrf_location)

rownames(SDRF) <- SDRF$Array.Data.File
SDRF <- AnnotatedDataFrame(SDRF)

raw_data <- oligo::read.celfiles(filenames = file.path(raw_data_dir, SDRF$Array.Data.File),
                                 verbose = FALSE,
                                 phenoData = SDRF)

meta <- read.table("/data/projects/Laura_Barkley/E-GEOD-5847/metadata.txt", header=T, sep="\t")

## re-arrange row.names(meta) to match colnames(raw_data)
index <- match(colnames(raw_data), row.names(meta))

meta_order <- meta[index, ]
meta <- meta_order 
rm(meta_order)

Biobase::pData(raw_data) <- meta

Subset dataset to include only non-IBC patients..

remove_idx <- row.names(meta)[which(meta$status == "IBC")]

eset <- oligo::rma(raw_data,  normalize = TRUE)

## Background correcting
## Normalizing
## Calculating Expression

exp <- Biobase::exprs(eset)

exp <- as.data.frame(exp)

exp <- as.data.frame(exp[, !(colnames(exp) %in% remove_idx) ])

meta <- subset(meta, meta$status == "non-IBC")

index <- match(colnames(exp), row.names(meta))

meta_order <- meta[index, ]
meta <- meta_order 
rm(meta_order)

table(rownames(meta) == colnames(exp))

## 
## TRUE 
##   68

PCA

PCA <- prcomp(t(exp), scale = FALSE)

percentVar <- round(100*PCA$sdev^2/sum(PCA$sdev^2),1)
sd_ratio <- sqrt(percentVar[2] / percentVar[1])

dataGG <- data.frame(PC1 = PCA$x[,1], PC2 = PCA$x[,2],
                    Tissue = meta$tissue)

ggpubr::ggscatter(dataGG, x="PC1", y="PC2",
                  color = "Tissue", palette = c("dodgerblue4", "darkorange2"),
                  title = "PCA plot log-transformed RMA normalized expression data\n [non-IBC PATIENTS]",
                  xlab = paste0("PC1, VarExp: ", percentVar[1], "%"),
                  ylab = paste0("PC2, VarExp: ", percentVar[2], "%"),
                  ellipse = TRUE, star.plot = TRUE,
                  ggtheme = theme_bw()) + 
                  theme(legend.position = "right") + 
                  theme(plot.title = element_text(hjust = 0.5, face = "bold"))

Samples

annotation_for_heatmap <- 
  data.frame(Tissue = meta$tissue)

row.names(annotation_for_heatmap) <- row.names(meta)

dists <- as.matrix(dist(t(exp), method = "manhattan"))

rownames(dists) <- row.names(meta)
hmcol <- rev(colorRampPalette(RColorBrewer::brewer.pal(9, "YlOrRd"))(255))
colnames(dists) <- NULL
diag(dists) <- NA

ann_colors <- list(
  Tissue = c(stroma = "black", tumor_epithelium = "forestgreen"))

pheatmap(dists, col = (hmcol), 
         annotation_row = annotation_for_heatmap,
         annotation_colors = ann_colors,
         legend = TRUE, 
         show_rownames = FALSE,
         treeheight_row = 0,
         legend_breaks = c(min(dists, na.rm = TRUE), 
                         max(dists, na.rm = TRUE)), 
         legend_labels = (c("small distance", "large distance")),
         main = "Clustering heatmap RMA normalised samples\n [non-IBC PATIENTS]")

Probe filtering

medians <- rowMedians(Biobase::exprs(eset))
man_threshold <- 4


hist_res <- hist(medians, 100, col = "cornsilk1", freq = FALSE, 
            main = "Histogram of the median intensities", 
            border = "antiquewhite4",
            xlab = "Median intensities")
abline(v = man_threshold, col = "coral4", lwd = 2)

no_of_samples <- table(meta$tissue)

sample_cutoff <- min(no_of_samples)

idx_man_threshold <- apply(Biobase::exprs(eset), 1, function(x){ sum(x > man_threshold) >= sample_cutoff})

table(idx_man_threshold) #2708 probes do not meet the cutoff

## idx_man_threshold
## FALSE  TRUE 
##   866 21417

print("866 PROBES do not meet the cut-off threshold (min intensity 4 in 34 samples. \n\n DISCARDING PROBES")

## [1] "866 PROBES do not meet the cut-off threshold (min intensity 4 in 34 samples. \n\n DISCARDING PROBES"

manual_filter <- subset(exp, idx_man_threshold)

probes <- rownames(manual_filter)

#BiocManager::install("hgu133a.db")

library(hgu133a.db)

annotation <- AnnotationDbi::select(hgu133a.db,
                                    keys = probes,
                                    columns = c("SYMBOL", "GENENAME"),
                                    keytype = "PROBEID")


annotation <- subset(annotation, !is.na(SYMBOL))

## resolve multi maps. 

ann_grouped <- group_by(annotation, PROBEID)
ann_sum <- dplyr::summarize(ann_grouped, no_of_matches = n_distinct(SYMBOL))

ann_flt <- filter(ann_sum, no_of_matches > 1)

remove_id <- (rownames(manual_filter) %in% ann_flt$PROBEID)

table(remove_id)

## remove_id
## FALSE  TRUE 
## 20605   812

print("812 PROBES map to multiple gene IDs %% REMOVING PROBES")

## [1] "812 PROBES map to multiple gene IDs %% REMOVING PROBES"

final_annotation <- subset(annotation, !remove_id)
final_Obj <- subset(manual_filter, !remove_id)

#fData(final_Obj)$PROBEID <- rownames(fData(final_Obj))

#fData(final_Obj) <- left_join(fData(final_Obj), annotation)

#rownames(fData(final_Obj)) <- fData(final_Obj)$PROBEID

Lm

tissue <- as.factor(meta$tissue)

patient_id <- as.factor(meta$patient_id)

design = model.matrix( ~ 0 + tissue + patient_id) 
colnames(design)[1:2] <- c("Stroma", "Epithelium")

contrast_matrix <- makeContrasts(Epi_vs_Str  = Epithelium-Stroma, levels = design) 

fit <- eBayes(contrasts.fit(lmFit(final_Obj, design = design), contrast_matrix))

#fit_flt <- lmFit(final_Obj, design=design)
#o <- order(fit_flt$Amean, decreasing = T)
#fit_flt <- fit_flt[o,]
#d <- duplicated(fit_flt$genes$SYMBOL)
#fit_flt <- fit_flt[!d,]
#fit_flt <- eBayes(contrasts.fit(fit_flt, contrast_matrix))

topT <- topTable(fit, number = Inf)

# manually add ID, resolve multiple values for GENE ID
topT$PROBEID <- rownames(topT)

topT <- merge(topT, final_annotation, by="PROBEID")

## be nice and re-organise for continuity...

topT <- topT[,c(1,8,9,2,3,4,5,6,7)]

## resolve duplicate gene expr values (took highest abs Ave Expr)

topT <- topT[order(topT$SYMBOL, -abs(topT$AveExpr)),]

topT <- topT[!duplicated(topT$SYMBOL),]


up_reg_epi_non_ibc <- subset(topT, logFC > 0.25 & P.Value < 0.05)

Up-Regulated Genes (Tumor Epithelium in non-IBC PATIENTS)

library(DT)
DT::datatable(up_reg_epi_non_ibc, rownames = FALSE, options=list(scrollX=T))

Overlapping Genes (TSC & Tumor Epithelium [non-IBC patients])

load("/data/projects/D_O_Connor/results.RData")

up_isec <- intersect(up$Gene, up_reg_epi_non_ibc$SYMBOL)

up_isec

##  [1] "GPR37"   "TFAP2C"  "PADI2"   "DSG2"    "SHROOM2" "CTSV"    "NLRP2"  
##  [8] "GALNT3"  "EFHD1"   "HOXC10"  "SORT1"   "PIP4K2C" "PSEN2"

E-GEOD-5847-nonIBC.RMd

Barry

8/31/2021

Stromal vs Epithelial within nonIBC patients

Subset dataset to include only non-IBC patients..

PCA

Samples

Probe filtering

Lm

Up-Regulated Genes (Tumor Epithelium in non-IBC PATIENTS)

Overlapping Genes (TSC & Tumor Epithelium [non-IBC patients])