library(Biobase)
library(oligoClasses)
library(ArrayExpress)
library(affycoretools)
library(oligo)
library(limma)
library(topGO)
library(clusterProfiler)
library(gplots)
library(ggplot2)
library(ggpubr)
library(geneplotter)
library(RColorBrewer)
library(pheatmap)
library(enrichplot)
library(dplyr)
library(tidyr)
library(stringr)
library(matrixStats)
library(genefilter)
library(openxlsx)
library(ArrayExpress)
library(u133x3p.db) #unique to Experiment, check CEL header for info and download the corresponding package. 
library(pd.hg.u133a)

PreProc

Load raw CEL files, perform RMA normalisation.

raw_data_dir <- "/data/projects/Laura_Barkley/E-GEOD-4779"
sdrf_location <- file.path(raw_data_dir, "E-GEOD-4779.sdrf.txt")
SDRF <- read.delim(sdrf_location)

rownames(SDRF) <- SDRF$Array.Data.File
SDRF <- AnnotatedDataFrame(SDRF)

raw_data <- oligo::read.celfiles(filenames = file.path(raw_data_dir, SDRF$Array.Data.File),
                                 verbose = FALSE,
                                 phenoData = SDRF)

oligo::boxplot(raw_data, target = "core", main = "Boxplot of log2-intensities [raw data]") + theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))

## NULL
eset <- oligo::rma(raw_data,  normalize = TRUE)
## Background correcting
## Normalizing
## Calculating Expression
exp <- Biobase::exprs(eset)

oligo::boxplot(exp, target = "core", main = "Boxplot of log2-intensities [RMA normalised data]") + theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))

## NULL

PCA

pDat <- pData(eset)

colnames(pDat)[5]<- "Pathological_Complete_Response"

PCA <- prcomp(t(exp), scale = FALSE)

percentVar <- round(100*PCA$sdev^2/sum(PCA$sdev^2),1)
sd_ratio <- sqrt(percentVar[2] / percentVar[1])

dataGG <- data.frame(PC1 = PCA$x[,1], PC2 = PCA$x[,2],
                    Response = pDat$Pathological_Complete_Response)

ggpubr::ggscatter(dataGG, x="PC1", y="PC2",
                  color = "Response", palette = c("dodgerblue4", "darkorange2"),
                  title = "PCA plot log-transformed RMA normalized expression data",
                  xlab = paste0("PC1, VarExp: ", percentVar[1], "%"),
                  ylab = paste0("PC2, VarExp: ", percentVar[2], "%"),
                  ellipse = TRUE, star.plot = TRUE,
                  ggtheme = theme_bw()) + 
                  theme(legend.position = "right") + 
                  theme(plot.title = element_text(hjust = 0.5, face = "bold"))

Samples

annotation_for_heatmap <- 
  data.frame(Response = pDat$Pathological_Complete_Response)

row.names(annotation_for_heatmap) <- row.names(pData(eset))

dists <- as.matrix(dist(t(exp), method = "manhattan"))

rownames(dists) <- row.names(pData(eset))
hmcol <- rev(colorRampPalette(RColorBrewer::brewer.pal(9, "YlOrRd"))(255))
colnames(dists) <- NULL
diag(dists) <- NA

ann_colors <- list(
  Response = c(pCR = "black", npCR = "forestgreen"))

pheatmap(dists, col = (hmcol), 
         annotation_row = annotation_for_heatmap,
         annotation_colors = ann_colors,
         legend = TRUE, 
         show_rownames = FALSE,
         treeheight_row = 0,
         legend_breaks = c(min(dists, na.rm = TRUE), 
                         max(dists, na.rm = TRUE)), 
         legend_labels = (c("small distance", "large distance")),
         main = "Clustering heatmap RMA normalised samples")

Probe filtering

medians <- rowMedians(Biobase::exprs(eset))
man_threshold <- 4


hist_res <- hist(medians, 100, col = "cornsilk1", freq = FALSE, 
            main = "Histogram of the median intensities", 
            border = "antiquewhite4",
            xlab = "Median intensities")
abline(v = man_threshold, col = "coral4", lwd = 2)

no_of_samples <- table(pDat$Pathological_Complete_Response)

sample_cutoff <- min(no_of_samples)

idx_man_threshold <- apply(Biobase::exprs(eset), 1, function(x){ sum(x > man_threshold) >= sample_cutoff})

table(idx_man_threshold) #2708 probes do not meet the cutoff
## idx_man_threshold
## FALSE  TRUE 
##  2708 58651
print("2708 PROBES do not meet the cut-off threshold (min intensity 4 in 39 samples. \n\n DISCARDING PROBES")
## [1] "2708 PROBES do not meet the cut-off threshold (min intensity 4 in 39 samples. \n\n DISCARDING PROBES"
manual_filter <- subset(eset, idx_man_threshold)

probes <- rownames(manual_filter)

annotation <- AnnotationDbi::select(u133x3p.db,
                                    keys = probes,
                                    columns = c("SYMBOL", "GENENAME"),
                                    keytype = "PROBEID")


annotation <- subset(annotation, !is.na(SYMBOL))

## resolve multi maps. 

ann_grouped <- group_by(annotation, PROBEID)
ann_sum <- dplyr::summarize(ann_grouped, no_of_matches = n_distinct(SYMBOL))

ann_flt <- filter(ann_sum, no_of_matches > 1)

remove_id <- (featureNames(manual_filter) %in% ann_flt$PROBEID)

table(remove_id)
## remove_id
## FALSE  TRUE 
## 55983  2668
print("2668 PROBES map to multiple gene IDs %% REMOVING PROBES")
## [1] "2668 PROBES map to multiple gene IDs %% REMOVING PROBES"
final_annotation <- subset(annotation, !remove_id)
final_Obj <- subset(manual_filter, !remove_id)

fData(final_Obj)$PROBEID <- rownames(fData(final_Obj))

fData(final_Obj) <- left_join(fData(final_Obj), annotation)

rownames(fData(final_Obj)) <- fData(final_Obj)$PROBEID

Lm

response <- as.factor(pDat$Pathological_Complete_Response)

design = model.matrix( ~ 0 + response ) 
colnames(design) <- c("npCR", "pCR")

contrast_matrix <- makeContrasts(npCR_vs_pCR  = npCR-pCR, levels = design) 

fit_flt <- lmFit(final_Obj, design=design)
o <- order(fit_flt$Amean, decreasing = T)
fit_flt <- fit_flt[o,]
d <- duplicated(fit_flt$genes$SYMBOL)
fit_flt <- fit_flt[!d,]

fit_flt <- eBayes(contrasts.fit(fit_flt, contrast_matrix))

topT <- topTable(fit_flt, number = Inf)

up_reg_npcr <- subset(topT, logFC > 0.25 & P.Value < 0.05)
down_reg_npcr <- subset(topT, logFC < -0.25 & P.Value < 0.05)

Up-Regulated Genes (non-Pathological Complete Response) [Resistant to FEC]

library(DT)
DT::datatable(up_reg_npcr, rownames = FALSE, options=list(scrollX=T))

Overlapping Genes (TSC & npCR)

load("/data/projects/D_O_Connor/results.RData")

up_isec <- intersect(up$Gene, up_reg_npcr$SYMBOL)

up_isec
## [1] "TFAP2C"  "SYT1"    "EDIL3"   "AMIGO2"  "PWWP3B"  "SDR42E1" "PLN"    
## [8] "PIEZO2"