Up regulated genes will be in reference to genes highly expressed within the TUMOUR EPITHELIUM SAMPLES.
library(Biobase)
library(oligoClasses)
library(ArrayExpress)
library(affycoretools)
#library(pd.hugene.1.0.st.v1)A
#library(hugene10sttranscriptcluster.db)
library(u133x3p.db)
library(oligo)
library(limma)
library(topGO)
library(clusterProfiler)
library(gplots)
library(ggplot2)
library(ggpubr)
library(geneplotter)
library(RColorBrewer)
library(pheatmap)
library(enrichplot)
library(dplyr)
library(tidyr)
library(stringr)
library(matrixStats)
library(genefilter)
library(openxlsx)
raw_data_dir <- "/data/projects/Laura_Barkley/E-GEOD-5847"
sdrf_location <- file.path(raw_data_dir, "E-GEOD-5847.sdrf.txt")
SDRF <- read.delim(sdrf_location)
rownames(SDRF) <- SDRF$Array.Data.File
SDRF <- AnnotatedDataFrame(SDRF)
raw_data <- oligo::read.celfiles(filenames = file.path(raw_data_dir, SDRF$Array.Data.File),
verbose = FALSE,
phenoData = SDRF)
meta <- read.table("/data/projects/Laura_Barkley/E-GEOD-5847/metadata.txt", header=T, sep="\t")
## re-arrange row.names(meta) to match colnames(raw_data)
index <- match(colnames(raw_data), row.names(meta))
meta_order <- meta[index, ]
meta <- meta_order
rm(meta_order)
Biobase::pData(raw_data) <- meta
eset <- oligo::rma(raw_data, normalize = TRUE)
## Background correcting
## Normalizing
## Calculating Expression
exp <- Biobase::exprs(eset)
PCA <- prcomp(t(exp), scale = FALSE)
percentVar <- round(100*PCA$sdev^2/sum(PCA$sdev^2),1)
sd_ratio <- sqrt(percentVar[2] / percentVar[1])
dataGG <- data.frame(PC1 = PCA$x[,1], PC2 = PCA$x[,2],
Tissue = meta$tissue)
ggpubr::ggscatter(dataGG, x="PC1", y="PC2",
color = "Tissue", palette = c("dodgerblue4", "darkorange2"),
title = "PCA plot log-transformed RMA normalized expression data",
xlab = paste0("PC1, VarExp: ", percentVar[1], "%"),
ylab = paste0("PC2, VarExp: ", percentVar[2], "%"),
ellipse = TRUE, star.plot = TRUE,
ggtheme = theme_bw()) +
theme(legend.position = "right") +
theme(plot.title = element_text(hjust = 0.5, face = "bold"))
annotation_for_heatmap <-
data.frame(Tissue = meta$tissue)
row.names(annotation_for_heatmap) <- row.names(meta)
dists <- as.matrix(dist(t(exp), method = "manhattan"))
rownames(dists) <- row.names(meta)
hmcol <- rev(colorRampPalette(RColorBrewer::brewer.pal(9, "YlOrRd"))(255))
colnames(dists) <- NULL
diag(dists) <- NA
ann_colors <- list(
Tissue = c(stroma = "black", tumor_epithelium = "forestgreen"))
pheatmap(dists, col = (hmcol),
annotation_row = annotation_for_heatmap,
annotation_colors = ann_colors,
legend = TRUE,
show_rownames = FALSE,
treeheight_row = 0,
legend_breaks = c(min(dists, na.rm = TRUE),
max(dists, na.rm = TRUE)),
legend_labels = (c("small distance", "large distance")),
main = "Clustering heatmap RMA normalised samples")
medians <- rowMedians(Biobase::exprs(eset))
man_threshold <- 4
hist_res <- hist(medians, 100, col = "cornsilk1", freq = FALSE,
main = "Histogram of the median intensities",
border = "antiquewhite4",
xlab = "Median intensities")
abline(v = man_threshold, col = "coral4", lwd = 2)
no_of_samples <- table(meta$tissue)
sample_cutoff <- min(no_of_samples)
idx_man_threshold <- apply(Biobase::exprs(eset), 1, function(x){ sum(x > man_threshold) >= sample_cutoff})
table(idx_man_threshold) #2708 probes do not meet the cutoff
## idx_man_threshold
## FALSE TRUE
## 996 21287
print("996 PROBES do not meet the cut-off threshold (min intensity 4 in 46 samples. \n\n DISCARDING PROBES")
## [1] "996 PROBES do not meet the cut-off threshold (min intensity 4 in 46 samples. \n\n DISCARDING PROBES"
manual_filter <- subset(eset, idx_man_threshold)
probes <- rownames(manual_filter)
library(hgu133a.db)
annotation <- AnnotationDbi::select(hgu133a.db,
keys = probes,
columns = c("SYMBOL", "GENENAME"),
keytype = "PROBEID")
annotation <- subset(annotation, !is.na(SYMBOL))
## resolve multi maps.
ann_grouped <- group_by(annotation, PROBEID)
ann_sum <- dplyr::summarize(ann_grouped, no_of_matches = n_distinct(SYMBOL))
ann_flt <- filter(ann_sum, no_of_matches > 1)
remove_id <- (featureNames(manual_filter) %in% ann_flt$PROBEID)
table(remove_id)
## remove_id
## FALSE TRUE
## 20482 805
print("805 PROBES map to multiple gene IDs %% REMOVING PROBES")
## [1] "805 PROBES map to multiple gene IDs %% REMOVING PROBES"
final_annotation <- subset(annotation, !remove_id)
final_Obj <- subset(manual_filter, !remove_id)
fData(final_Obj)$PROBEID <- rownames(fData(final_Obj))
fData(final_Obj) <- left_join(fData(final_Obj), annotation)
rownames(fData(final_Obj)) <- fData(final_Obj)$PROBEID
tissue <- as.factor(meta$tissue)
status <- as.factor(meta$status)
status <- ifelse(status == "non-IBC", "non_IBC", "IBC")
patient_id <- as.factor(meta$patient_id)
design = model.matrix( ~ 0 + tissue + patient_id + status)
colnames(design)[1:2] <- c("Stroma", "Epithelium")
contrast_matrix <- makeContrasts(Epi_vs_Str = Epithelium-Stroma, levels = design)
fit_flt <- lmFit(final_Obj, design=design)
## Coefficients not estimable: statusnon_IBC
o <- order(fit_flt$Amean, decreasing = T)
fit_flt <- fit_flt[o,]
d <- duplicated(fit_flt$genes$SYMBOL)
fit_flt <- fit_flt[!d,]
fit_flt <- eBayes(contrasts.fit(fit_flt, contrast_matrix))
topT <- topTable(fit_flt, number = Inf)
up_reg_epi <- subset(topT, logFC > 0.25 & P.Value < 0.05)
library(DT)
DT::datatable(up_reg_epi, rownames = FALSE, options=list(scrollX=T))
load("/data/projects/D_O_Connor/results.RData")
up_isec <- intersect(up$Gene, up_reg_epi$SYMBOL)
up_isec
## [1] "GPR37" "SLC24A3" "TFAP2C" "PADI2" "DSG2" "SHROOM2" "CTSV"
## [8] "NLRP2" "GALNT3" "EFHD1" "HOXC10" "SORT1" "PIP4K2C" "PSEN2"