Up regulated genes will be in reference to genes highly expressed within the TUMOUR EPITHELIUM SAMPLES.
library(Biobase)
library(oligoClasses)
library(ArrayExpress)
library(affycoretools)
#library(pd.hugene.1.0.st.v1)A
#library(hugene10sttranscriptcluster.db)
library(u133x3p.db)
library(oligo)
library(limma)
library(topGO)
library(clusterProfiler)
library(gplots)
library(ggplot2)
library(ggpubr)
library(geneplotter)
library(RColorBrewer)
library(pheatmap)
library(enrichplot)
library(dplyr)
library(tidyr)
library(stringr)
library(matrixStats)
library(genefilter)
library(openxlsx)
raw_data_dir <- "/data/projects/Laura_Barkley/E-GEOD-5847"
sdrf_location <- file.path(raw_data_dir, "E-GEOD-5847.sdrf.txt")
SDRF <- read.delim(sdrf_location)
rownames(SDRF) <- SDRF$Array.Data.File
SDRF <- AnnotatedDataFrame(SDRF)
raw_data <- oligo::read.celfiles(filenames = file.path(raw_data_dir, SDRF$Array.Data.File),
verbose = FALSE,
phenoData = SDRF)
meta <- read.table("/data/projects/Laura_Barkley/E-GEOD-5847/metadata.txt", header=T, sep="\t")
## re-arrange row.names(meta) to match colnames(raw_data)
index <- match(colnames(raw_data), row.names(meta))
meta_order <- meta[index, ]
meta <- meta_order
rm(meta_order)
Biobase::pData(raw_data) <- meta
remove_idx <- row.names(meta)[which(meta$status == "IBC")]
eset <- oligo::rma(raw_data, normalize = TRUE)
## Background correcting
## Normalizing
## Calculating Expression
exp <- Biobase::exprs(eset)
exp <- as.data.frame(exp)
exp <- as.data.frame(exp[, !(colnames(exp) %in% remove_idx) ])
meta <- subset(meta, meta$status == "non-IBC")
index <- match(colnames(exp), row.names(meta))
meta_order <- meta[index, ]
meta <- meta_order
rm(meta_order)
table(rownames(meta) == colnames(exp))
##
## TRUE
## 68
PCA <- prcomp(t(exp), scale = FALSE)
percentVar <- round(100*PCA$sdev^2/sum(PCA$sdev^2),1)
sd_ratio <- sqrt(percentVar[2] / percentVar[1])
dataGG <- data.frame(PC1 = PCA$x[,1], PC2 = PCA$x[,2],
Tissue = meta$tissue)
ggpubr::ggscatter(dataGG, x="PC1", y="PC2",
color = "Tissue", palette = c("dodgerblue4", "darkorange2"),
title = "PCA plot log-transformed RMA normalized expression data\n [non-IBC PATIENTS]",
xlab = paste0("PC1, VarExp: ", percentVar[1], "%"),
ylab = paste0("PC2, VarExp: ", percentVar[2], "%"),
ellipse = TRUE, star.plot = TRUE,
ggtheme = theme_bw()) +
theme(legend.position = "right") +
theme(plot.title = element_text(hjust = 0.5, face = "bold"))
annotation_for_heatmap <-
data.frame(Tissue = meta$tissue)
row.names(annotation_for_heatmap) <- row.names(meta)
dists <- as.matrix(dist(t(exp), method = "manhattan"))
rownames(dists) <- row.names(meta)
hmcol <- rev(colorRampPalette(RColorBrewer::brewer.pal(9, "YlOrRd"))(255))
colnames(dists) <- NULL
diag(dists) <- NA
ann_colors <- list(
Tissue = c(stroma = "black", tumor_epithelium = "forestgreen"))
pheatmap(dists, col = (hmcol),
annotation_row = annotation_for_heatmap,
annotation_colors = ann_colors,
legend = TRUE,
show_rownames = FALSE,
treeheight_row = 0,
legend_breaks = c(min(dists, na.rm = TRUE),
max(dists, na.rm = TRUE)),
legend_labels = (c("small distance", "large distance")),
main = "Clustering heatmap RMA normalised samples\n [non-IBC PATIENTS]")
medians <- rowMedians(Biobase::exprs(eset))
man_threshold <- 4
hist_res <- hist(medians, 100, col = "cornsilk1", freq = FALSE,
main = "Histogram of the median intensities",
border = "antiquewhite4",
xlab = "Median intensities")
abline(v = man_threshold, col = "coral4", lwd = 2)
no_of_samples <- table(meta$tissue)
sample_cutoff <- min(no_of_samples)
idx_man_threshold <- apply(Biobase::exprs(eset), 1, function(x){ sum(x > man_threshold) >= sample_cutoff})
table(idx_man_threshold) #2708 probes do not meet the cutoff
## idx_man_threshold
## FALSE TRUE
## 866 21417
print("866 PROBES do not meet the cut-off threshold (min intensity 4 in 34 samples. \n\n DISCARDING PROBES")
## [1] "866 PROBES do not meet the cut-off threshold (min intensity 4 in 34 samples. \n\n DISCARDING PROBES"
manual_filter <- subset(exp, idx_man_threshold)
probes <- rownames(manual_filter)
#BiocManager::install("hgu133a.db")
library(hgu133a.db)
annotation <- AnnotationDbi::select(hgu133a.db,
keys = probes,
columns = c("SYMBOL", "GENENAME"),
keytype = "PROBEID")
annotation <- subset(annotation, !is.na(SYMBOL))
## resolve multi maps.
ann_grouped <- group_by(annotation, PROBEID)
ann_sum <- dplyr::summarize(ann_grouped, no_of_matches = n_distinct(SYMBOL))
ann_flt <- filter(ann_sum, no_of_matches > 1)
remove_id <- (rownames(manual_filter) %in% ann_flt$PROBEID)
table(remove_id)
## remove_id
## FALSE TRUE
## 20605 812
print("812 PROBES map to multiple gene IDs %% REMOVING PROBES")
## [1] "812 PROBES map to multiple gene IDs %% REMOVING PROBES"
final_annotation <- subset(annotation, !remove_id)
final_Obj <- subset(manual_filter, !remove_id)
#fData(final_Obj)$PROBEID <- rownames(fData(final_Obj))
#fData(final_Obj) <- left_join(fData(final_Obj), annotation)
#rownames(fData(final_Obj)) <- fData(final_Obj)$PROBEID
tissue <- as.factor(meta$tissue)
patient_id <- as.factor(meta$patient_id)
design = model.matrix( ~ 0 + tissue + patient_id)
colnames(design)[1:2] <- c("Stroma", "Epithelium")
contrast_matrix <- makeContrasts(Epi_vs_Str = Epithelium-Stroma, levels = design)
fit <- eBayes(contrasts.fit(lmFit(final_Obj, design = design), contrast_matrix))
#fit_flt <- lmFit(final_Obj, design=design)
#o <- order(fit_flt$Amean, decreasing = T)
#fit_flt <- fit_flt[o,]
#d <- duplicated(fit_flt$genes$SYMBOL)
#fit_flt <- fit_flt[!d,]
#fit_flt <- eBayes(contrasts.fit(fit_flt, contrast_matrix))
topT <- topTable(fit, number = Inf)
# manually add ID, resolve multiple values for GENE ID
topT$PROBEID <- rownames(topT)
topT <- merge(topT, final_annotation, by="PROBEID")
## be nice and re-organise for continuity...
topT <- topT[,c(1,8,9,2,3,4,5,6,7)]
## resolve duplicate gene expr values (took highest abs Ave Expr)
topT <- topT[order(topT$SYMBOL, -abs(topT$AveExpr)),]
topT <- topT[!duplicated(topT$SYMBOL),]
up_reg_epi_non_ibc <- subset(topT, logFC > 0.25 & P.Value < 0.05)
library(DT)
DT::datatable(up_reg_epi_non_ibc, rownames = FALSE, options=list(scrollX=T))
load("/data/projects/D_O_Connor/results.RData")
up_isec <- intersect(up$Gene, up_reg_epi_non_ibc$SYMBOL)
up_isec
## [1] "GPR37" "TFAP2C" "PADI2" "DSG2" "SHROOM2" "CTSV" "NLRP2"
## [8] "GALNT3" "EFHD1" "HOXC10" "SORT1" "PIP4K2C" "PSEN2"