library(Biobase)
library(oligoClasses)
library(ArrayExpress)
library(affycoretools)
library(oligo)
library(limma)
library(topGO)
library(clusterProfiler)
library(gplots)
library(ggplot2)
library(ggpubr)
library(geneplotter)
library(RColorBrewer)
library(pheatmap)
library(enrichplot)
library(dplyr)
library(tidyr)
library(stringr)
library(matrixStats)
library(genefilter)
library(openxlsx)
library(ArrayExpress)
library(u133x3p.db) #unique to Experiment, check CEL header for info and download the corresponding package.
library(pd.hg.u133a)
Load raw CEL files, perform RMA normalisation.
raw_data_dir <- "/data/projects/Laura_Barkley/E-GEOD-4779"
sdrf_location <- file.path(raw_data_dir, "E-GEOD-4779.sdrf.txt")
SDRF <- read.delim(sdrf_location)
rownames(SDRF) <- SDRF$Array.Data.File
SDRF <- AnnotatedDataFrame(SDRF)
raw_data <- oligo::read.celfiles(filenames = file.path(raw_data_dir, SDRF$Array.Data.File),
verbose = FALSE,
phenoData = SDRF)
oligo::boxplot(raw_data, target = "core", main = "Boxplot of log2-intensities [raw data]") + theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))
## NULL
eset <- oligo::rma(raw_data, normalize = TRUE)
## Background correcting
## Normalizing
## Calculating Expression
exp <- Biobase::exprs(eset)
oligo::boxplot(exp, target = "core", main = "Boxplot of log2-intensities [RMA normalised data]") + theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))
## NULL
pDat <- pData(eset)
colnames(pDat)[5]<- "Pathological_Complete_Response"
PCA <- prcomp(t(exp), scale = FALSE)
percentVar <- round(100*PCA$sdev^2/sum(PCA$sdev^2),1)
sd_ratio <- sqrt(percentVar[2] / percentVar[1])
dataGG <- data.frame(PC1 = PCA$x[,1], PC2 = PCA$x[,2],
Response = pDat$Pathological_Complete_Response)
ggpubr::ggscatter(dataGG, x="PC1", y="PC2",
color = "Response", palette = c("dodgerblue4", "darkorange2"),
title = "PCA plot log-transformed RMA normalized expression data",
xlab = paste0("PC1, VarExp: ", percentVar[1], "%"),
ylab = paste0("PC2, VarExp: ", percentVar[2], "%"),
ellipse = TRUE, star.plot = TRUE,
ggtheme = theme_bw()) +
theme(legend.position = "right") +
theme(plot.title = element_text(hjust = 0.5, face = "bold"))
annotation_for_heatmap <-
data.frame(Response = pDat$Pathological_Complete_Response)
row.names(annotation_for_heatmap) <- row.names(pData(eset))
dists <- as.matrix(dist(t(exp), method = "manhattan"))
rownames(dists) <- row.names(pData(eset))
hmcol <- rev(colorRampPalette(RColorBrewer::brewer.pal(9, "YlOrRd"))(255))
colnames(dists) <- NULL
diag(dists) <- NA
ann_colors <- list(
Response = c(pCR = "black", npCR = "forestgreen"))
pheatmap(dists, col = (hmcol),
annotation_row = annotation_for_heatmap,
annotation_colors = ann_colors,
legend = TRUE,
show_rownames = FALSE,
treeheight_row = 0,
legend_breaks = c(min(dists, na.rm = TRUE),
max(dists, na.rm = TRUE)),
legend_labels = (c("small distance", "large distance")),
main = "Clustering heatmap RMA normalised samples")
medians <- rowMedians(Biobase::exprs(eset))
man_threshold <- 4
hist_res <- hist(medians, 100, col = "cornsilk1", freq = FALSE,
main = "Histogram of the median intensities",
border = "antiquewhite4",
xlab = "Median intensities")
abline(v = man_threshold, col = "coral4", lwd = 2)
no_of_samples <- table(pDat$Pathological_Complete_Response)
sample_cutoff <- min(no_of_samples)
idx_man_threshold <- apply(Biobase::exprs(eset), 1, function(x){ sum(x > man_threshold) >= sample_cutoff})
table(idx_man_threshold) #2708 probes do not meet the cutoff
## idx_man_threshold
## FALSE TRUE
## 2708 58651
print("2708 PROBES do not meet the cut-off threshold (min intensity 4 in 39 samples. \n\n DISCARDING PROBES")
## [1] "2708 PROBES do not meet the cut-off threshold (min intensity 4 in 39 samples. \n\n DISCARDING PROBES"
manual_filter <- subset(eset, idx_man_threshold)
probes <- rownames(manual_filter)
annotation <- AnnotationDbi::select(u133x3p.db,
keys = probes,
columns = c("SYMBOL", "GENENAME"),
keytype = "PROBEID")
annotation <- subset(annotation, !is.na(SYMBOL))
## resolve multi maps.
ann_grouped <- group_by(annotation, PROBEID)
ann_sum <- dplyr::summarize(ann_grouped, no_of_matches = n_distinct(SYMBOL))
ann_flt <- filter(ann_sum, no_of_matches > 1)
remove_id <- (featureNames(manual_filter) %in% ann_flt$PROBEID)
table(remove_id)
## remove_id
## FALSE TRUE
## 55983 2668
print("2668 PROBES map to multiple gene IDs %% REMOVING PROBES")
## [1] "2668 PROBES map to multiple gene IDs %% REMOVING PROBES"
final_annotation <- subset(annotation, !remove_id)
final_Obj <- subset(manual_filter, !remove_id)
fData(final_Obj)$PROBEID <- rownames(fData(final_Obj))
fData(final_Obj) <- left_join(fData(final_Obj), annotation)
rownames(fData(final_Obj)) <- fData(final_Obj)$PROBEID
response <- as.factor(pDat$Pathological_Complete_Response)
design = model.matrix( ~ 0 + response )
colnames(design) <- c("npCR", "pCR")
contrast_matrix <- makeContrasts(npCR_vs_pCR = npCR-pCR, levels = design)
fit_flt <- lmFit(final_Obj, design=design)
o <- order(fit_flt$Amean, decreasing = T)
fit_flt <- fit_flt[o,]
d <- duplicated(fit_flt$genes$SYMBOL)
fit_flt <- fit_flt[!d,]
fit_flt <- eBayes(contrasts.fit(fit_flt, contrast_matrix))
topT <- topTable(fit_flt, number = Inf)
up_reg_npcr <- subset(topT, logFC > 0.25 & P.Value < 0.05)
down_reg_npcr <- subset(topT, logFC < -0.25 & P.Value < 0.05)
library(DT)
DT::datatable(up_reg_npcr, rownames = FALSE, options=list(scrollX=T))
load("/data/projects/D_O_Connor/results.RData")
up_isec <- intersect(up$Gene, up_reg_npcr$SYMBOL)
up_isec
## [1] "TFAP2C" "SYT1" "EDIL3" "AMIGO2" "PWWP3B" "SDR42E1" "PLN"
## [8] "PIEZO2"