We first set up a working directory named raw_data_dir_PC
getwd();
raw_data_dir_PC = "~/All documents/UOE/PhD work/my project/raw_CEL_analysis/workspace"; # change this path to the working directory accordingly
setwd(raw_data_dir_PC);
The following pacakages are needed for our analysis for the preprocessing and DGE analysis.
#General Bioconductor packages
library(Biobase)
library(oligoClasses)
library(affy)
#BiocManager::install("affxparser")
library(affxparser)
#Annotation and data import packages
library(ArrayExpress)
library(pd.clariom.d.human)
library(clariomdhumantranscriptcluster.db)
library(clariomdhumanprobeset.db)
#Quality control and pre-processing packages
library(oligo)
library(arrayQualityMetrics)
#Analysis and statistics packages
library(limma)
library(topGO)
library(ReactomePA)
library(clusterProfiler)
#Plotting and color options packages
library(gplots)
library(ggplot2)
library(geneplotter)
library(RColorBrewer)
library(pheatmap)
#Formatting/documentation packages
library(rmarkdown)
library(BiocStyle)
library(dplyr)
library(tidyr)
#Helpers:
library(stringr)
library(matrixStats)
library(genefilter)
library(openxlsx)
library(devtools)
library(reshape2)
library(FactoMineR)
library(factoextra)
Our (Affymetrix) microarray analysis starts with raw CEL files. CEL files are the results of the processing of the raw image files using the Affymetrix software and contain estimated probe intensity values. We first import the CEL files stored in the folder from the working directory. We use a function read.celfiles from the oligo package to import our raw CEL files into R by using the following codes:
CELs_PC <- list.celfiles("/Users/zhuofanmou/All documents/UOE/PhD work/my project/raw_CEL_analysis/workspace", full.names = TRUE) # change this path to where the CEL files are stored
raw_data_PC <- oligo::read.celfiles(CELs_PC) # read in the CEL files
stopifnot(validObject(raw_data_PC))
The ExpressionSet named raw_data_PC is now created, then we need to specify our prostate sample information as phenoData in the ExpressionSet. According to the names of the CEL files (i.e., colnames(exprs(raw_data_PC))), we create the phenoData which contains the columns of our interest (i.e., sample ID and tumor status). The information is shown below:
Biobase::pData(raw_data_PC)$Sample.ID <-
c('10A1','10E5','12A1','12E5','13A1','13E5','14A1','14E5','15A1_2','15A1','15D4_2','15D4','17A1','17B2','7A1','7K11','8A1','8G7','904','9A1') # adding patients label
Biobase::pData(raw_data_PC)$Cancer.status <-
c('T','B','T','B','T','B','T','B','T','T','B','B','B','T','B','T','B','T','B','T') # adding Cancer status
head(Biobase::pData(raw_data_PC))
## index Sample.ID Cancer.status
## DC_UKB_LH_10A1_T_Sample_4_(Clariom_D_Human).CEL 1 10A1 T
## DC_UKB_LH_10E5_B_Sample_3_(Clariom_D_Human).CEL 2 10E5 B
## DC_UKB_LH_12A1_T_Sample_2_(Clariom_D_Human).CEL 3 12A1 T
## DC_UKB_LH_12E5_B_Sample_1_(Clariom_D_Human).CEL 4 12E5 B
## DC_UKB_LH_13A1_T_Sample_20_(Clariom_D_Human).CEL 5 13A1 T
## DC_UKB_LH_13E5_B_Sample_19_(Clariom_D_Human).CEL 6 13E5 B
The major components of our ExpressionSet are now completed. We can use the function exprs(raw_data_PC) and pData(raw_data_PC) to access the expression matrix and phenoData respectively. The heads of the results are shown below:
head(exprs(raw_data_PC))
## DC_UKB_LH_10A1_T_Sample_4_(Clariom_D_Human).CEL
## 1 3896
## 2 328
## 3 4157
## 4 256
## 5 112
## 6 193
## DC_UKB_LH_10E5_B_Sample_3_(Clariom_D_Human).CEL
## 1 3260
## 2 112
## 3 3455
## 4 118
## 5 133
## 6 133
## DC_UKB_LH_12A1_T_Sample_2_(Clariom_D_Human).CEL
## 1 4821
## 2 159
## 3 4571
## 4 147
## 5 102
## 6 237
## DC_UKB_LH_12E5_B_Sample_1_(Clariom_D_Human).CEL
## 1 4224
## 2 84
## 3 4050
## 4 67
## 5 88
## 6 244
## DC_UKB_LH_13A1_T_Sample_20_(Clariom_D_Human).CEL
## 1 3416
## 2 176
## 3 3530
## 4 134
## 5 158
## 6 175
## DC_UKB_LH_13E5_B_Sample_19_(Clariom_D_Human).CEL
## 1 3339
## 2 84
## 3 3137
## 4 97
## 5 102
## 6 209
## DC_UKB_LH_14A1_T_Sample_18_(Clariom_D_Human).CEL
## 1 3717
## 2 126
## 3 3881
## 4 95
## 5 105
## 6 224
## DC_UKB_LH_14E5_B_Sample_17_(Clariom_D_Human).CEL
## 1 2421
## 2 77
## 3 3162
## 4 91
## 5 107
## 6 132
## DC_UKB_LH_15A1_T_2_Sample_14_(Clariom_D_Human).CEL
## 1 4938
## 2 111
## 3 4613
## 4 108
## 5 111
## 6 155
## DC_UKB_LH_15A1_T_Sample_16_(Clariom_D_Human).CEL
## 1 3043
## 2 88
## 3 3240
## 4 91
## 5 78
## 6 130
## DC_UKB_LH_15D4_B_2_Sample_13_(Clariom_D_Human).CEL
## 1 3442
## 2 104
## 3 2886
## 4 98
## 5 144
## 6 130
## DC_UKB_LH_15D4_B_Sample_15_(Clariom_D_Human).CEL
## 1 3561
## 2 102
## 3 3333
## 4 87
## 5 139
## 6 196
## DC_UKB_LH_17A1_B_Sample_11_(Clariom_D_Human).CEL
## 1 2934
## 2 284
## 3 3882
## 4 241
## 5 195
## 6 78
## DC_UKB_LH_17B2_T_Sample_12_(Clariom_D_Human).CEL
## 1 4458
## 2 130
## 3 4099
## 4 80
## 5 141
## 6 104
## DC_UKB_LH_7A1_B_Sample_9_(Clariom_D_Human).CEL
## 1 2741
## 2 106
## 3 3196
## 4 87
## 5 141
## 6 100
## DC_UKB_LH_7K11_T_Sample_10_(Clariom_D_Human).CEL
## 1 5500
## 2 110
## 3 5242
## 4 240
## 5 176
## 6 211
## DC_UKB_LH_8A1_B_Sample_7_(Clariom_D_Human).CEL
## 1 5150
## 2 303
## 3 4874
## 4 180
## 5 112
## 6 98
## DC_UKB_LH_8G7_T_Sample_8_(Clariom_D_Human)_2.CEL
## 1 3899
## 2 91
## 3 3606
## 4 95
## 5 85
## 6 179
## DC_UKB_LH_904_B_Sample_5_(Clariom_D_Human).CEL
## 1 4286
## 2 130
## 3 3923
## 4 82
## 5 129
## 6 140
## DC_UKB_LH_9A1_T_Sample_6_(Clariom_D_Human)_2.CEL
## 1 2448
## 2 236
## 3 3113
## 4 166
## 5 115
## 6 81
head(pData(raw_data_PC))
## index Sample.ID Cancer.status
## DC_UKB_LH_10A1_T_Sample_4_(Clariom_D_Human).CEL 1 10A1 T
## DC_UKB_LH_10E5_B_Sample_3_(Clariom_D_Human).CEL 2 10E5 B
## DC_UKB_LH_12A1_T_Sample_2_(Clariom_D_Human).CEL 3 12A1 T
## DC_UKB_LH_12E5_B_Sample_1_(Clariom_D_Human).CEL 4 12E5 B
## DC_UKB_LH_13A1_T_Sample_20_(Clariom_D_Human).CEL 5 13A1 T
## DC_UKB_LH_13E5_B_Sample_19_(Clariom_D_Human).CEL 6 13E5 B
Note that there is one patient labelled with 15 which has two sample replicates, so we remove the first replicate (i.e., 15A1 and 15D4) so that we keep only one patient for every paired malignant/benign samples.
raw_data_15rem_PC = raw_data_PC[ , -c(10,12)]
raw_data_PC = raw_data_15rem_PC
Finally, the ExpressionSet raw_data_PC, apart from the missing of the featureData which we will specify it when we perform the probeset annotation later, is ready to be used for our following analysis.
raw_data_PC
## HTAFeatureSet (storageMode: lockedEnvironment)
## assayData: 6892960 features, 18 samples
## element names: exprs
## protocolData
## rowNames: DC_UKB_LH_10A1_T_Sample_4_(Clariom_D_Human).CEL
## DC_UKB_LH_10E5_B_Sample_3_(Clariom_D_Human).CEL ...
## DC_UKB_LH_9A1_T_Sample_6_(Clariom_D_Human)_2.CEL (18 total)
## varLabels: exprs dates
## varMetadata: labelDescription channel
## phenoData
## rowNames: DC_UKB_LH_10A1_T_Sample_4_(Clariom_D_Human).CEL
## DC_UKB_LH_10E5_B_Sample_3_(Clariom_D_Human).CEL ...
## DC_UKB_LH_9A1_T_Sample_6_(Clariom_D_Human)_2.CEL (18 total)
## varLabels: index Sample.ID Cancer.status
## varMetadata: labelDescription channel
## featureData: none
## experimentData: use 'experimentData(object)'
## Annotation: pd.clariom.d.human
In microarray and other high-throughput technologies, it is important to assess the quality of raw data, also known as quality control. It helps us to check the existence of any sample outliers, to see if the data are clustered as expected (based on the tumor status: T=malignant/tumor or B=benign/normal) and most importantly whether or not the data need to be normalized so that the data from different samples are comparable. We can obtain the expression matrix stored in the ExpressionSet by calling exprs(raw_data_PC). The rows of the expression matrix represent the microarray probes, while the columns represent sample arrays of paired malignant/benign tumor status of every patient, respectively (i.e., 18 columns in total as we have 9 patients).
Generally, there are many sources of noise in microarray experiments:
Systematic differences between the samples that are due to noise rather than true biological variability should be removed in order to make biologically meaningful conclusions about the data. Thus, a proper normalisation method is usually required.
In this report, we will apply Robust multichip average (RMA) algorithm to our raw data. The RMA convolution model can be integrated as the following procedures:
Note that we can summarize the probes on the levels of known transcripts (by setting target = core in the function rma from package oligo).
We first take the log2 of Biobase::exprs(raw_data), as expression data is more convenient to be analyzed on a logarithmic scale.
exp_raw_PC <- log2(Biobase::exprs(raw_data_PC)) # log2 transform
We also perform the series of steps of RMA algorithm by calling the function rma from oligo. Then the RMA calibrated ExpressionSet is named palmieri_eset_norm_PC. Note that here we retain all probes available (i.e., no pre-filtered) for normalisation.
palmieri_eset_norm_PC <- oligo::rma(raw_data_PC, target = "core") # RMA on the raw data
## Background correcting
## Normalizing
## Calculating Expression
palmieri_eset_norm_PC
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 138745 features, 18 samples
## element names: exprs
## protocolData
## rowNames: DC_UKB_LH_10A1_T_Sample_4_(Clariom_D_Human).CEL
## DC_UKB_LH_10E5_B_Sample_3_(Clariom_D_Human).CEL ...
## DC_UKB_LH_9A1_T_Sample_6_(Clariom_D_Human)_2.CEL (18 total)
## varLabels: exprs dates
## varMetadata: labelDescription channel
## phenoData
## rowNames: DC_UKB_LH_10A1_T_Sample_4_(Clariom_D_Human).CEL
## DC_UKB_LH_10E5_B_Sample_3_(Clariom_D_Human).CEL ...
## DC_UKB_LH_9A1_T_Sample_6_(Clariom_D_Human)_2.CEL (18 total)
## varLabels: index Sample.ID Cancer.status
## varMetadata: labelDescription channel
## featureData: none
## experimentData: use 'experimentData(object)'
## Annotation: pd.clariom.d.human
We can see that there are overall 138745 features summarised in the ExpressionSet palmieri_eset_norm_PC and that means we have 138745 rows of transcript clusters/genes in the expression data exprs(palmieri_eset_norm_PC) .
Now, we will assess the data quality by using various diagnostic plots (e.g., boxplots, MA plots and density plots) for both the log2-transformed raw data and the RMA calibrated data.
One of the most used quality assessments (on the raw data) is the pairwise MA plots. It has log-ratio, M, on the horizontal axis, versus average log-intensities, A, on the vertical axis. In other words, if we have two samples \(i\) and \(j\) together with \(I_{i}\) and \(I_{j}\) representing the gene expression values in sample \(i\) and \(j\), respectively, then based on \(log_{2}\) scale, we have: \[M_{i,j}=log_{2}(I_{i})-log_{2}(I_{j})=log_{2}(I_{i}/I_{j})\]
difference in \(log_{2}\) expression or \(log_{2}\)-ratio between 2 samples
\[A_{i,j}= \frac{1}{2}[log_{2}(I_{i})+log_{2}(I_{j})]=log_{2}(\sqrt{I_{i} \times I_{j}})\]
average log2 expression between two samples.
Now, we produce MA plots for four randomly picked samples for the raw data (left) and RMA calibrated data(right)
# For raw data
affy::mva.pairs(pm(raw_data_PC)[,c(4,8,12,18)],plot.method="normal", main = "MAplot for Prostate raw data ") # Note that we only consider the perfect matched probes
# For RMA data
exprs_RMA_PC = exprs(palmieri_eset_norm_PC)
affy::mva.pairs(exprs_RMA_PC[,c(4,8,12,18)],plot.method="normal", main = "MAplot for prostate data after RMA ")
Ideally, the cloud of data points in the MA-plot should be centered around M=0 (blue line). This is because we assume that the majority of the genes is not DE and that the number of upregulated genes is similar to the number of down-regulated genes. Additionally, the variability of the M values should be similar across different array-array combinations. As we can see from left that the spread of the point cloud increases with the average intensity: the loess curve (the fitted red line) for the genes in each pairwise plot moves further and further away (or up and down) from M=0 when A increases, which indicate a normalization is needed to remove the dependency. Then the RMA algorithm was applied to normalise the data and the results are shown in the right. It now shows a much more symmetric and even spread of the data points indicating that the dependence of the variability on the expression level is not as strong as it was before normalization.
One quality control check is to plot the distribution of log base 2 intensities of perfect match probes for comparison of probe intensity behavior between different arrays (\(log_{2}(PM_{ij})\) for array i and probe j). For the raw data, there are clear differences in shape of the sample distributions (see left), indicating that a proper normalization is required. And after the we apply the RMA normalisation, the shapes of the sample distributions are now very similar (see right), meaning that the data from each sample are now adjusted and comparable.
# For raw data
pmexp_PC = pm(raw_data_PC); #or can use raw_data_Islet; Note that we only consider the perfect matched probes
sampleNames_PC = vector()
logs_PC = vector()
for (i in 1:18)
{
sampleNames_PC = c(sampleNames_PC,rep(as.character(pData(raw_data_PC)[i,2]),dim(pmexp_PC)[1])) #Note: here sample names in 'pData(raw_data_Islet)[i,2]' must be character: 'as.character(pData(raw_data_Islet)[i,2])'
logs_PC = c(logs_PC,log2(pmexp_PC[,i]))
}
logData_PC = data.frame(logInt=logs_PC,sampleName=sampleNames_PC)
dataHist2_PC = ggplot(logData_PC, aes(logInt, colour = sampleName))
dataHist2_PC + geom_density()+ ggtitle("Density plot of the log2-intensitites raw data")
# For RMA data
exp_palmieri_PC <- Biobase::exprs(palmieri_eset_norm_PC)
exp_RMA_PC = exp_palmieri_PC
sampleNames_RMA_PC = vector()
logs_RMA_PC = vector()
for (i in 1:18)
{
sampleNames_RMA_PC = c(sampleNames_RMA_PC,rep(as.character(pData(palmieri_eset_norm_PC)[i,2]),dim(exp_RMA_PC)[1])) #Note: here sample names in 'pData(raw_data_Islet)[i,2]' must be character: 'as.character(pData(raw_data_Islet)[i,2])'
logs_RMA_PC = c(logs_RMA_PC,exp_RMA_PC[,i])
}
logData_RMA_PC = data.frame(logInt=logs_RMA_PC,sampleName=sampleNames_RMA_PC)
dataHist2_RMA_PC = ggplot(logData_RMA_PC, aes(logInt, colour = sampleName))
dataHist2_RMA_PC + geom_density() + ggtitle("Density plot of the RMA calibrated prostate data")
Both boxplots and density plots show the same differences in probe intensity behavior between sample arrays. In order to perform meaningful statistical analysis and inferences from the data, we need to ensure that all the samples are comparable. To examine and compare the overall distribution of log transformed perfect match intensities between the samples we can use a density plot, but we can get a clearer view with a box plot as it shows:
We note that by using boxplot function from oligo, we do not need to previoulsy log2 transform the expression data. Instead, the function uses ExpressionSet as input directly.
#For raw data
oligo::boxplot(raw_data_PC, target = "core", col = FALSE,
main = "Boxplot of log2-intensitites for Prostate raw data", xlab = "Sample Number", ylab = "Log2-intensitites", names = pData(raw_data_PC)[ , 2])
# For RMA data
oligo::boxplot(palmieri_eset_norm_PC, target = "core", col = FALSE,
main = "Boxplot of RMA calibrated prostate data", xlab = "Sample Number", ylab = "Log2-intensitites", names = pData(palmieri_eset_norm_PC)[ , 2])
As we can see from above that after normalization, none of the samples stand out from the rest. The different arrays should now have a very comparable median expression level. Also, the scale of the boxes should be very comparable indicating that the spread of the intensity values on the different arrays is equalized.
Now, to check whether the overall variability of the samples reflects their grouping, we can perform a Principal Component Analysis (PCA) and hierarchical clustering dendrogram. In other words, we can check if the benign tumor (B) samples are homogenous and distinguishable from samples of the other malignant tumor (T) group.
# For raw data
PCA_raw_PC <- prcomp(t(exp_raw_PC), scale. = FALSE)
percentVar_PC <- round(100*PCA_raw_PC$sdev^2/sum(PCA_raw_PC$sdev^2),1)
sd_ratio_PC <- sqrt(percentVar_PC[2] / percentVar_PC[1])
dataGG_PC <- data.frame(PC1 = PCA_raw_PC$x[,1], PC2 = PCA_raw_PC$x[,2],
Tumor_status = pData(raw_data_PC)$Cancer.status)
ggplot(dataGG_PC, aes(PC1, PC2)) +
geom_point(aes(colour = Tumor_status)) +
geom_text(aes(label =pData(raw_data_PC)[ , 2] ), hjust = -0.1, vjust = -0.3) + # add point names
ggtitle("PCA plot of the log-transformed raw expression data") +
xlab(paste0("PC1, VarExp: ", percentVar_PC[1], "%")) +
xlim(-1000,2000)+
ylim(-1000,1000)+
ylab(paste0("PC2, VarExp: ", percentVar_PC[2], "%")) +
theme(plot.title = element_text(hjust = 0.5))+
coord_fixed(ratio = sd_ratio_PC) +
scale_color_manual(values = c("#666666", "#3399FF"))
# For RMA data
exp_palmieri_PC <- Biobase::exprs(palmieri_eset_norm_PC)
PCA_RMA_PC <- prcomp(t(exp_palmieri_PC), scale = FALSE)
percentVar_RMA_PC <- round(100*PCA_RMA_PC$sdev^2/sum(PCA_RMA_PC$sdev^2),1)
sd_ratio_RMA_PC <- sqrt(percentVar_RMA_PC[2] / percentVar_RMA_PC[1])
dataGG_RMA_PC <- data.frame(PC1 = PCA_RMA_PC$x[,1], PC2 = PCA_RMA_PC$x[,2],
Tumor_status = Biobase::pData(palmieri_eset_norm_PC)$Cancer.status)
ggplot(dataGG_RMA_PC, aes(PC1, PC2)) +
geom_point(aes(colour = Tumor_status)) +
geom_text(aes(label =pData(palmieri_eset_norm_PC)[ , 2] ), hjust = -0.1, vjust = -0.3) + # add point names
ggtitle("PCA plot of the RMA calibrated, summarized data") +
xlab(paste0("PC1, VarExp: ", percentVar_RMA_PC[1], "%")) +
xlim(-200,400)+
ylim(-200,200)+
ylab(paste0("PC2, VarExp: ", percentVar_RMA_PC[2], "%")) +
theme(plot.title = element_text(hjust = 0.5)) +
coord_fixed(ratio = sd_ratio_RMA_PC) +
scale_color_manual(values = c("#666666", "#3399FF"))
We also plot the dendrogram to show the clustering of the sample after RMA calibrated (ideally, we want the cancer status labelled with B and T are clustered seperatly) (Q: Do we usually plot sample-to-sample dendro for paired dataset ot is it worthless?)
Biobase::pData(palmieri_eset_norm_PC)$Sample.ID_for_dendrogram = c('10A1_T','10E5_B','12A1_T','12E5_B','13A1_T','13E5_B','14A1_T','14E5_B','15A1_2_T','15D4_2_B','17A1_B','17B2_T','7A1_B','7K11_T','8A1_B','8G7_T','904_B','9A1_T')#c('10A1_T','10E5_B','12A1_T','12E5_B','13A1_T','13E5_B','14A1_T','14E5_B','15A1_2_T','15D4_2_B','17A1_B','17B2_T','7A1_B','7K11_T','8A1_B','8G7_T','904_B','9A1_T')
clustering <- exp_palmieri_PC %>% t() %>% dist(method = "euclidean") %>%
hclust(method = "complete")
plot(clustering, labels = (pData(palmieri_eset_norm_PC)$Sample.ID_for_dendrogram),main = "Cluster dendrogram of 18 samples (Euclidean)")
The PCA plots indicates that the PC1 differentiates between the tumor types, although the data points are still noisy.
The dendrogram confirms that benign tumor samples such as 7A1, 904,12E5 and 14E5 are misclustered with those of malignant tumor samples.
(Q: Any possible reasons?: Patients 7&12 are under specific treatments, patients 9&14 are the only active smokers among the patients?)
Now, we add ‘featureData’, i.e. annotation information of the transcript cluster identifiers stored in the ExpressionSet.
There are two ways of annotating:
using the annotation that is assembled by the BioC core team using data from public repositories, either on the levels of probe set regions ‘clariomdhumanprobeset.db’ or transcripts ‘clariomdhumantranscriptcluster.db’. By doing this way, there are overall 24406 out of 138745 rows of the probset/genes actually annotated.
using the annotation provided by Affymetrix (as available on their NetAffx site), which is contained in the Platform Design (pd) package ‘pd.clariom.d.human’. By doing this way, there are overall 86161 out of 138745 rows of the probset/genes actually annotated.
The second approach will be used as it gives ~60% of the total transcript clusters annotated.
In order to annotate the probeset, we will use the package affycoretools and apply the function annotateEset; the feature information is then automatically stored to the ‘featureData’ of the ExpressionSet palmieri_eset_norm_PC
library(affycoretools)
palmieri_eset_norm_PC <- annotateEset(palmieri_eset_norm_PC, pd.clariom.d.human) # annotating the features with annotation package 'pd.clariom.d.human'
#palmieri_eset_norm_PC <- annotateEset(palmieri_eset_norm_PC, clariomdhumantranscriptcluster.db) # this line of code is for the First approach
Now, to access the featureData, we can use
head(fData(palmieri_eset_norm_PC))
## PROBEID ID SYMBOL GENENAME
## AFFX-BkGr-GC03_st AFFX-BkGr-GC03_st <NA> <NA> <NA>
## AFFX-BkGr-GC04_st AFFX-BkGr-GC04_st <NA> <NA> <NA>
## AFFX-BkGr-GC05_st AFFX-BkGr-GC05_st <NA> <NA> <NA>
## AFFX-BkGr-GC06_st AFFX-BkGr-GC06_st <NA> <NA> <NA>
## AFFX-BkGr-GC07_st AFFX-BkGr-GC07_st <NA> <NA> <NA>
## AFFX-BkGr-GC08_st AFFX-BkGr-GC08_st <NA> <NA> <NA>
However, as we can see above there are many rows of probesets which are not annotated (i.e., NAs).
To see how many rows of transcrip clusters are truly annotated, we can use the following code
length(which(!is.na(fData(palmieri_eset_norm_PC)$SYMBOL)))
## [1] 86161
There are 86161 out of 138745 rows of the probset/genes annotated.
Now, we perform the following filtering steps priori to futher analysis such as differential gene expression analysis:
Now, one of the strategies we can use in order to reduce the number of gene input is to filter out lowly expressed genes. Microarray data commonly show a large number of probes in the background intensity range. These probes also do not change much across arrays. Hence they combine a low variance with a low intensity. Thus, they could end up being detected as differentially expressed although they are barely above the “detection” limit and are not very informative in general.
For intensity-based filtering, we calculate the row-wise means/averages from the expression data, as they represent the transcript means, and assign them to palmieri_means_PC. We also apply the function shorth deom the genefilter package in order to decide the expression cutoff value. The shorth is the shortest interval that covers half of the values in x (i.e., the mean vector for each probe across the whole samples). This function calculates the mean of the x values that lie in the shorth. This was proposed by Andrews (1972) as a robust estimator of location.
palmieri_means_PC <- rowMeans(Biobase::exprs(palmieri_eset_norm_PC))
hist_res_PC <- hist(palmieri_means_PC, 100, col = "azure3", freq = FALSE,
main = "Histogram of the mean intensities",
border = "antiquewhite4",
xlab = "Mean intensities")
sh_cutoff <- shorth(palmieri_means_PC) #https://support.bioconductor.org/p/69467/#69744); shorth cutoff: 2.089
# histgram with abline
man_threshold_PC <- sh_cutoff #!NEED DISCUSSSION WITH LORNA
hist_res_PC <- hist(palmieri_means_PC, 100, col = "azure3", freq = FALSE,
main = "Histogram of the mean intensities with cutoff line",
border = "antiquewhite4",
xlab = "Mean intensities")
abline(v = man_threshold_PC, col = "coral4", lwd = 2)
filtered.set <- palmieri_eset_norm_PC[palmieri_means_PC >=sh_cutoff,] # number of probes passed the shorth cutoff
dim(filtered.set)
## Features Samples
## 98548 18
no_of_samples_PC <- table(paste0(pData(palmieri_eset_norm_PC)$Cancer.status)) # number of samples
no_of_samples_PC
##
## B T
## 9 9
samples_cutoff_PC <- min(no_of_samples_PC) # select the group with minimum number of samples (9 in our case)
idx_man_threshold_PC <- apply(Biobase::exprs(palmieri_eset_norm_PC), 1,function(x){sum(x > man_threshold_PC) >=samples_cutoff_PC})
table(idx_man_threshold_PC) # number of probes that passed the short cutoff and in at least 50% samples
## idx_man_threshold_PC
## FALSE TRUE
## 42733 96012
palmieri_manfiltered_PC <- subset(palmieri_eset_norm_PC, idx_man_threshold_PC) # filter out the probes do not meet the criteria
The distribution of the histograms above shows an enrichment of low means on the left-hand side, and these represent lowly expressed genes which we want to filter out. Q: is the cutoff using shorth ok?
Many transcript-cluster identifiers will map to multiple gene symbols, i.e. they can’t be unambigously assigned.
We first get rid of the unannotated transcript clusters
anno_palmieri_PC <- fData(palmieri_manfiltered_PC)
anno_palmieri_PC <- subset(anno_palmieri_PC, !is.na(SYMBOL))
dim(anno_palmieri_PC)
## [1] 64292 4
anno_palmieri_PC[1:6,1:4]
## PROBEID ID SYMBOL
## TC0100006432.hg.1 TC0100006432.hg.1 NR_046018 DDX11L1
## TC0100006433.hg.1 TC0100006433.hg.1 spopoybu.aAug10-unspliced spopoybu
## TC0100006434.hg.1 TC0100006434.hg.1 NR_036267 MIR1302-10
## TC0100006436.hg.1 TC0100006436.hg.1 ENST00000492842 OR4G2P
## TC0100006437.hg.1 TC0100006437.hg.1 NM_001005484 OR4F5
## TC0100006438.hg.1 TC0100006438.hg.1 ENST00000442987 CICP27
## GENENAME
## TC0100006432.hg.1 DEAD/H (Asp-Glu-Ala-Asp/His) box helicase 11 like 1
## TC0100006433.hg.1 Transcript Identified by AceView
## TC0100006434.hg.1 microRNA 1302-10
## TC0100006436.hg.1 olfactory receptor, family 4, subfamily G, member 2 pseudogene
## TC0100006437.hg.1 olfactory receptor, family 4, subfamily F, member 5
## TC0100006438.hg.1 capicua transcriptional repressor pseudogene 27 [Source:HGNC Symbol;Acc:HGNC:48835]
Then we check how many rows that have the same probeID but mapping to different genes; There is no probesetID/transcript-cluster identifier mapping to different genes when using the annotation package ‘pd.clariom.d.human’ (i.e., each probe maps to one gene but not necessary to be a distinct gene). However, on the other hand, it is surely the case when applying the annotation package ‘clariomdhumantranscriptcluster.db’ .
anno_grouped_PC <- group_by(anno_palmieri_PC, PROBEID) #group by the probesetIDs
dim(anno_grouped_PC)
## [1] 64292 4
anno_summarized_PC <-
dplyr::summarize(anno_grouped_PC, no_of_matches = n_distinct(SYMBOL)) #list out all the distinct IDs and the corresponding frequency
head(anno_summarized_PC)
## # A tibble: 6 x 2
## PROBEID no_of_matches
## <chr> <int>
## 1 TC0100006432.hg.1 1
## 2 TC0100006433.hg.1 1
## 3 TC0100006434.hg.1 1
## 4 TC0100006436.hg.1 1
## 5 TC0100006437.hg.1 1
## 6 TC0100006438.hg.1 1
anno_filtered_PC <- filter(anno_summarized_PC, no_of_matches > 1) # filter for the IDs that have mapping greater than 1
head(anno_filtered_PC)
## # A tibble: 0 x 2
## # … with 2 variables: PROBEID <chr>, no_of_matches <int>
probe_stats_PC <- anno_filtered_PC #With dim(probe_stats), we can see how many probes have been mapped to multiple genes.
nrow(probe_stats_PC)
## [1] 0
ids_to_exlude_PC <- (featureNames(palmieri_manfiltered_PC) %in% probe_stats_PC$PROBEID) # matching the probes that need to be removed
table(ids_to_exlude_PC)
## ids_to_exlude_PC
## FALSE
## 96012
palmieri_final_PC <- subset(palmieri_manfiltered_PC, !ids_to_exlude_PC) #filter out the probes having multiple mappings; and creat the final ExpressionSet
validObject(palmieri_final_PC)
## [1] TRUE
head(anno_palmieri_PC)
## PROBEID ID SYMBOL
## TC0100006432.hg.1 TC0100006432.hg.1 NR_046018 DDX11L1
## TC0100006433.hg.1 TC0100006433.hg.1 spopoybu.aAug10-unspliced spopoybu
## TC0100006434.hg.1 TC0100006434.hg.1 NR_036267 MIR1302-10
## TC0100006436.hg.1 TC0100006436.hg.1 ENST00000492842 OR4G2P
## TC0100006437.hg.1 TC0100006437.hg.1 NM_001005484 OR4F5
## TC0100006438.hg.1 TC0100006438.hg.1 ENST00000442987 CICP27
## GENENAME
## TC0100006432.hg.1 DEAD/H (Asp-Glu-Ala-Asp/His) box helicase 11 like 1
## TC0100006433.hg.1 Transcript Identified by AceView
## TC0100006434.hg.1 microRNA 1302-10
## TC0100006436.hg.1 olfactory receptor, family 4, subfamily G, member 2 pseudogene
## TC0100006437.hg.1 olfactory receptor, family 4, subfamily F, member 5
## TC0100006438.hg.1 capicua transcriptional repressor pseudogene 27 [Source:HGNC Symbol;Acc:HGNC:48835]
fData(palmieri_final_PC)$PROBEID <- rownames(fData(palmieri_final_PC))
fData(palmieri_final_PC) <- left_join(fData(palmieri_final_PC), anno_palmieri_PC)
rownames(fData(palmieri_final_PC)) <- fData(palmieri_final_PC)$PROBEID
validObject(palmieri_final_PC)
## [1] TRUE
length(which(!is.na(fData(palmieri_final_PC)$SYMBOL))) # check for number of truly annotated probesets
## [1] 64292
boxplot(palmieri_final_PC)
As nrow(probe_stats_PC) gives 0,which means that each of the transcrip-cluster identifiers annotated maps to only one gene (N.B., it does not mean that we are safe for the case where we have multiple transcrip-cluster identifiers that mapping to a same gene; and we will deal with this later).
f_filbyNA_PC <- na.omit(fData(palmieri_final_PC)) #filter out the NA rows
ids_to_exlude_NA_PC <- (featureNames(palmieri_final_PC) %in% f_filbyNA_PC$PROBEID) #match the feature names from ExpressionSet to the annotated probesetIDs that are left; it is a logical object
palmieri_final_PC = subset(palmieri_final_PC, ids_to_exlude_NA_PC) # subset the ExpressionSet for which includes the features that are only truly annotated
palmieri_final_PC
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 64292 features, 18 samples
## element names: exprs
## protocolData
## rowNames: DC_UKB_LH_10A1_T_Sample_4_(Clariom_D_Human).CEL
## DC_UKB_LH_10E5_B_Sample_3_(Clariom_D_Human).CEL ...
## DC_UKB_LH_9A1_T_Sample_6_(Clariom_D_Human)_2.CEL (18 total)
## varLabels: exprs dates
## varMetadata: labelDescription channel
## phenoData
## rowNames: DC_UKB_LH_10A1_T_Sample_4_(Clariom_D_Human).CEL
## DC_UKB_LH_10E5_B_Sample_3_(Clariom_D_Human).CEL ...
## DC_UKB_LH_9A1_T_Sample_6_(Clariom_D_Human)_2.CEL (18 total)
## varLabels: index Sample.ID Cancer.status Sample.ID_for_dendrogram
## varMetadata: labelDescription channel
## featureData
## featureNames: TC0100006432.hg.1 TC0100006433.hg.1 ...
## TSUnmapped00000826.hg.1 (64292 total)
## fvarLabels: PROBEID ID SYMBOL GENENAME
## fvarMetadata: labelDescription
## experimentData: use 'experimentData(object)'
## Annotation: pd.clariom.d.human
Now, we have the ExpressionSet palmieri_final_PC which contains only those truly annotated features/transcript-cluster identifiers/genes/probeset IDs/rows.
We filter out the genes/transcripts that are identified by the gene library AceView because nothing is known about these transcripts and some of them are only predicted so are not particularly useful.
df_Aceview = fData(palmieri_final_PC)
no_of_AceView = df_Aceview[grepl("AceView", df_Aceview$GENENAME),]
head(no_of_AceView)
## PROBEID ID SYMBOL
## TC0100006433.hg.1 TC0100006433.hg.1 spopoybu.aAug10-unspliced spopoybu
## TC0100006443.hg.1 TC0100006443.hg.1 storubu.aAug10-unspliced storubu
## TC0100006446.hg.1 TC0100006446.hg.1 boyrabu.aAug10-unspliced boyrabu
## TC0100006462.hg.1 TC0100006462.hg.1 jerfobu.aAug10 jerfobu
## TC0100006477.hg.1 TC0100006477.hg.1 sarmeybo.aAug10-unspliced sarmeybo
## TC0100006478.hg.1 TC0100006478.hg.1 turorbu.aAug10-unspliced turorbu
## GENENAME
## TC0100006433.hg.1 Transcript Identified by AceView
## TC0100006443.hg.1 Transcript Identified by AceView
## TC0100006446.hg.1 Transcript Identified by AceView
## TC0100006462.hg.1 Transcript Identified by AceView
## TC0100006477.hg.1 Transcript Identified by AceView
## TC0100006478.hg.1 Transcript Identified by AceView
length(no_of_AceView$PROBEID)
## [1] 20849
f_filbyAceView_PC = df_Aceview[!grepl("AceView", df_Aceview$GENENAME),]
ids_to_match_AceView_PC <- (featureNames(palmieri_final_PC) %in% f_filbyAceView_PC$PROBEID)
palmieri_final_PC = subset(palmieri_final_PC, ids_to_match_AceView_PC)
palmieri_final_PC
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 43443 features, 18 samples
## element names: exprs
## protocolData
## rowNames: DC_UKB_LH_10A1_T_Sample_4_(Clariom_D_Human).CEL
## DC_UKB_LH_10E5_B_Sample_3_(Clariom_D_Human).CEL ...
## DC_UKB_LH_9A1_T_Sample_6_(Clariom_D_Human)_2.CEL (18 total)
## varLabels: exprs dates
## varMetadata: labelDescription channel
## phenoData
## rowNames: DC_UKB_LH_10A1_T_Sample_4_(Clariom_D_Human).CEL
## DC_UKB_LH_10E5_B_Sample_3_(Clariom_D_Human).CEL ...
## DC_UKB_LH_9A1_T_Sample_6_(Clariom_D_Human)_2.CEL (18 total)
## varLabels: index Sample.ID Cancer.status Sample.ID_for_dendrogram
## varMetadata: labelDescription channel
## featureData
## featureNames: TC0100006432.hg.1 TC0100006434.hg.1 ...
## TSUnmapped00000826.hg.1 (43443 total)
## fvarLabels: PROBEID ID SYMBOL GENENAME
## fvarMetadata: labelDescription
## experimentData: use 'experimentData(object)'
## Annotation: pd.clariom.d.human
Next, we also filter out the genes identified as pseudogenes.
df_pseudo = fData(palmieri_final_PC)
no_of_pseudo = df_pseudo[grepl("pseudogene", df_pseudo$GENENAME),]
head(no_of_pseudo)
## PROBEID ID SYMBOL
## TC0100006436.hg.1 TC0100006436.hg.1 ENST00000492842 OR4G2P
## TC0100006438.hg.1 TC0100006438.hg.1 ENST00000442987 CICP27
## TC0100006450.hg.1 TC0100006450.hg.1 ENST00000437905 WBP1LP7
## TC0100006451.hg.1 TC0100006451.hg.1 ENST00000432723 CICP7
## TC0100006453.hg.1 TC0100006453.hg.1 ENST00000416931 MTND1P23
## TC0100006454.hg.1 TC0100006454.hg.1 ENST00000457540 MTND2P28
## GENENAME
## TC0100006436.hg.1 olfactory receptor, family 4, subfamily G, member 2 pseudogene
## TC0100006438.hg.1 capicua transcriptional repressor pseudogene 27 [Source:HGNC Symbol;Acc:HGNC:48835]
## TC0100006450.hg.1 WW domain binding protein 1-like pseudogene 7 [Source:HGNC Symbol;Acc:HGNC:43955]
## TC0100006451.hg.1 capicua transcriptional repressor pseudogene 7 [Source:HGNC Symbol;Acc:HGNC:37756]
## TC0100006453.hg.1 mitochondrially encoded NADH:ubiquinone oxidoreductase core subunit 1 pseudogene 23 [Source:HGNC Symbol;Acc:HGNC:42092]
## TC0100006454.hg.1 mitochondrially encoded NADH:ubiquinone oxidoreductase core subunit 2 pseudogene 28 [Source:HGNC Symbol;Acc:HGNC:42129]
length(no_of_pseudo$PROBEID)
## [1] 6959
f_filbypseudogene_PC = df_pseudo[!grepl("pseudogene", df_pseudo$GENENAME),]
ids_to_match_pseudogene_PC <- (featureNames(palmieri_final_PC) %in% f_filbypseudogene_PC$PROBEID)
palmieri_final_PC = subset(palmieri_final_PC, ids_to_match_pseudogene_PC)
palmieri_final_PC
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 36484 features, 18 samples
## element names: exprs
## protocolData
## rowNames: DC_UKB_LH_10A1_T_Sample_4_(Clariom_D_Human).CEL
## DC_UKB_LH_10E5_B_Sample_3_(Clariom_D_Human).CEL ...
## DC_UKB_LH_9A1_T_Sample_6_(Clariom_D_Human)_2.CEL (18 total)
## varLabels: exprs dates
## varMetadata: labelDescription channel
## phenoData
## rowNames: DC_UKB_LH_10A1_T_Sample_4_(Clariom_D_Human).CEL
## DC_UKB_LH_10E5_B_Sample_3_(Clariom_D_Human).CEL ...
## DC_UKB_LH_9A1_T_Sample_6_(Clariom_D_Human)_2.CEL (18 total)
## varLabels: index Sample.ID Cancer.status Sample.ID_for_dendrogram
## varMetadata: labelDescription channel
## featureData
## featureNames: TC0100006432.hg.1 TC0100006434.hg.1 ...
## TSUnmapped00000826.hg.1 (36484 total)
## fvarLabels: PROBEID ID SYMBOL GENENAME
## fvarMetadata: labelDescription
## experimentData: use 'experimentData(object)'
## Annotation: pd.clariom.d.human
There are some probes having their GENENAME as ‘- - -’, we also remove them. ?
# remove '---' genenames
df_threebar = fData(palmieri_final_PC)
no_of_threebar = df_threebar[grepl("---", df_threebar$GENENAME),]
head(no_of_threebar)
## PROBEID ID SYMBOL GENENAME
## TC0100006440.hg.1 TC0100006440.hg.1 ENST00000496488.1 RP11-34P13.9 ---
## TC0100006444.hg.1 TC0100006444.hg.1 ENST00000412666.1 RP4-669L17.2 ---
## TC0100006452.hg.1 TC0100006452.hg.1 ENST00000423796.1 RP5-857K21.2 ---
## TC0100006455.hg.1 TC0100006455.hg.1 uc057avw.1 AC114498.1 ---
## TC0100006464.hg.1 TC0100006464.hg.1 ENST00000422528.1 RP11-206L10.4 ---
## TC0100006466.hg.1 TC0100006466.hg.1 ENST00000358533.2 RP11-206L10.9 ---
length(no_of_threebar$PROBEID)
## [1] 11880
f_filbythree_PC = df_threebar[!grepl("---", df_threebar$GENENAME),]
ids_to_match_threebar_PC <- (featureNames(palmieri_final_PC) %in% f_filbythree_PC$PROBEID)
palmieri_final_PC = subset(palmieri_final_PC, ids_to_match_threebar_PC)
palmieri_final_PC
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 24604 features, 18 samples
## element names: exprs
## protocolData
## rowNames: DC_UKB_LH_10A1_T_Sample_4_(Clariom_D_Human).CEL
## DC_UKB_LH_10E5_B_Sample_3_(Clariom_D_Human).CEL ...
## DC_UKB_LH_9A1_T_Sample_6_(Clariom_D_Human)_2.CEL (18 total)
## varLabels: exprs dates
## varMetadata: labelDescription channel
## phenoData
## rowNames: DC_UKB_LH_10A1_T_Sample_4_(Clariom_D_Human).CEL
## DC_UKB_LH_10E5_B_Sample_3_(Clariom_D_Human).CEL ...
## DC_UKB_LH_9A1_T_Sample_6_(Clariom_D_Human)_2.CEL (18 total)
## varLabels: index Sample.ID Cancer.status Sample.ID_for_dendrogram
## varMetadata: labelDescription channel
## featureData
## featureNames: TC0100006432.hg.1 TC0100006434.hg.1 ...
## TSUnmapped00000826.hg.1 (24604 total)
## fvarLabels: PROBEID ID SYMBOL GENENAME
## fvarMetadata: labelDescription
## experimentData: use 'experimentData(object)'
## Annotation: pd.clariom.d.human
It is usually the case that we can have multiple probes/transcripts that are mapping to the same genes, for example
fData = fData(palmieri_final_PC)
n_occur <- data.frame(table(fData$SYMBOL))
n_occur<-n_occur[n_occur$Freq > 1,]
head(n_occur)
## Var1 Freq
## 1 5_8S_rRNA 3
## 2 5S_rRNA 11
## 3 7SK 3
## 47 ABCA9-AS1 2
## 55 ABCC1 2
## 108 ABO 2
We will collapse the probes into one for the same gene based their (max) mean expression aross teh whole samples.We achieve this by using the following codes:
ids_PC = fData(palmieri_final_PC)[,c(1,3)] ## only need the probesetIDs (1st column) and gene symbols (3rd column)
ids_PC = ids_PC[ids_PC[,2] != '',] ## get rid of 'NA' gene symbols 86161
ID2gene = ids_PC
exprSet = exprs(palmieri_final_PC)
{
MAX = by(exprSet, ID2gene[,2],
function(x) rownames(x)[ which.max(rowMeans(x))])
MAX = as.character(MAX)
exprSet = exprSet[rownames(exprSet) %in% MAX,]
#rownames( exprSet ) = ID2gene[ match( rownames( exprSet ), ID2gene[ , 1 ] ), 2 ] # rownames to gene symbol
}
dim(exprSet)
## [1] 22165 18
ids_fr_mulprocolpase_probes_PC <- (featureNames(palmieri_final_PC) %in% rownames(exprSet))
palmieri_final_PC = subset(palmieri_final_PC, ids_fr_mulprocolpase_probes_PC)
palmieri_final_PC
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 22165 features, 18 samples
## element names: exprs
## protocolData
## rowNames: DC_UKB_LH_10A1_T_Sample_4_(Clariom_D_Human).CEL
## DC_UKB_LH_10E5_B_Sample_3_(Clariom_D_Human).CEL ...
## DC_UKB_LH_9A1_T_Sample_6_(Clariom_D_Human)_2.CEL (18 total)
## varLabels: exprs dates
## varMetadata: labelDescription channel
## phenoData
## rowNames: DC_UKB_LH_10A1_T_Sample_4_(Clariom_D_Human).CEL
## DC_UKB_LH_10E5_B_Sample_3_(Clariom_D_Human).CEL ...
## DC_UKB_LH_9A1_T_Sample_6_(Clariom_D_Human)_2.CEL (18 total)
## varLabels: index Sample.ID Cancer.status Sample.ID_for_dendrogram
## varMetadata: labelDescription channel
## featureData
## featureNames: TC0100006432.hg.1 TC0100006437.hg.1 ...
## TSUnmapped00000826.hg.1 (22165 total)
## fvarLabels: PROBEID ID SYMBOL GENENAME
## fvarMetadata: labelDescription
## experimentData: use 'experimentData(object)'
## Annotation: pd.clariom.d.human
Finally, the ExpressionSet palmieri_final_PC ended up with 22165 genes/features.
Now, we perform the differential gene expression analysis based on the pre-filtered palmieri_final_PC ExpressionSet.
Biobase::pData(palmieri_final_PC)$Sample.ID_for_DGE = c('10','10','12','12','13','13','14','14','15_2','15_2','17','17','7','7','8','8','9','9')
individual_PC = as.character(Biobase::pData(palmieri_final_PC)$Sample.ID_for_DGE) #label the tissues with their corresponding patient numbers
tumor = Biobase::pData(palmieri_final_PC)$Cancer.status #group_list
tumor = factor(tumor)
individual_PC = factor(individual_PC)
design_patasvar_PC <- model.matrix(~0+tumor+individual_PC) # design matrix
design_patasvar_PC <- design_patasvar_PC[ , c (2,1,3:10)] # re-arrange the columns
colnames(design_patasvar_PC)[1:2] <- c("T", "B")
rownames(design_patasvar_PC) <- individual_PC
head(design_patasvar_PC)
## T B individual_PC12 individual_PC13 individual_PC14 individual_PC15_2
## 10 1 0 0 0 0 0
## 10 0 1 0 0 0 0
## 12 1 0 1 0 0 0
## 12 0 1 1 0 0 0
## 13 1 0 0 1 0 0
## 13 0 1 0 1 0 0
## individual_PC17 individual_PC7 individual_PC8 individual_PC9
## 10 0 0 0 0
## 10 0 0 0 0
## 12 0 0 0 0
## 12 0 0 0 0
## 13 0 0 0 0
## 13 0 0 0 0
contrast_matrix_PC <- makeContrasts( T-B, levels = design_patasvar_PC) #contrast matrix
contrast_matrix_PC
## Contrasts
## Levels T - B
## T 1
## B -1
## individual_PC12 0
## individual_PC13 0
## individual_PC14 0
## individual_PC15_2 0
## individual_PC17 0
## individual_PC7 0
## individual_PC8 0
## individual_PC9 0
palmieri_fit_PC <- eBayes(contrasts.fit(lmFit(palmieri_final_PC,
design = design_patasvar_PC),
contrast_matrix_PC))
# results <- decideTests (palmieri_fit_PC)
# summary (results)
table_PC <- topTable(palmieri_fit_PC, number = Inf, adjust.method = "BH") #results
#head(table_PC)
table_PC[1:6,5:10]
## logFC AveExpr t P.Value adj.P.Val B
## TC0X00007559.hg.1 2.132570 5.149454 9.058751 1.711672e-06 0.03793921 4.789762
## TC0600014215.hg.1 2.289623 4.568959 8.000315 5.797001e-06 0.05197291 3.882451
## TC0400007511.hg.1 2.649395 5.968684 7.841290 7.034456e-06 0.05197291 3.733359
## TC1100006671.hg.1 2.820101 4.477998 7.440521 1.160157e-05 0.06428721 3.341640
## TC1700012093.hg.1 1.783100 5.668067 7.186389 1.608970e-05 0.07132565 3.080948
## TC0700010910.hg.1 1.550022 6.122438 6.720457 2.991761e-05 0.08223906 2.577001
table(decideTests(palmieri_fit_PC, adjust.method = "BH", p.value = 0.1, lfc = log2(1.5))) # table of gene classes; '-1' = downregulated, '1' = upregulated, '0' = not
##
## -1 0 1
## 2 22116 47
table_PC = na.omit(table_PC) #! This is the result we want
write.csv(table_PC,"limma_table_PC_results.csv",quote = F) # export results after DGE analysis for ALL genes
The adj.P.Val from the topTable results is the Benjamini-Hochberg adjusted p-value. It is calculated by setting the parameter adjust.method to BH in function toptable.
As a diagnostic check, we also plot the raw p-value histogram: We expect a uniform distribution for the p-values that correspond to true null hypotheses, while a peak near zero shows an enrichment for low p-values corresponding to differentially expressed (DE) genes.
hist(table_PC$P.Value, col = brewer.pal(3, name = "Set2")[3],
main = "Tumor vs Normal prostate tissuses", xlab = "p-values")
Here, we take gene TC0X00007559.hg.1 with gene symbol GJB1 as an example because it is ranked first from the DGE result table by having the lowest adjusted p-values.
tumor_for_single_gene = Biobase::pData(palmieri_final_PC)$Cancer.status
#tissue_CD <- tissue[disease == "CD"]
expr_PC <- Biobase::exprs(palmieri_final_PC)["TC0X00007559.hg.1", ]
data_PC <- as.data.frame(expr_PC)
colnames(data_PC)[1] <- "org_value"
data_PC <- mutate(data_PC, individual = individual_PC, tumor_for_single_gene) #'tumor_for_single_gene' = 'tumor'
data_PC$Cancer_type <- factor(data_PC$tumor_for_single_gene, levels = c("B", "T")) #factorise
ggplot(data = data_PC, aes(x = Cancer_type, y = org_value,
group = individual, color = individual)) +
geom_line() +
ggtitle("Expression changes for the GJB1 gene")
# crat_coef_PC <- lmFit(palmieri_final_PC,
# design = design_patasvar_PC)$coefficients["TC0X00007559.hg.1",]
# crat_coef_PC
#
# crat_fitted_PC <- design_patasvar_PC %*% crat_coef_PC
# rownames(crat_fitted_PC) <- names(crat_expr_PC)
# colnames(crat_fitted_PC) <- "fitted_value"
#
# crat_fitted_PC
#
# crat_data_PC$fitted_value <- crat_fitted_PC
#
# ggplot(data = crat_data_PC, aes(x = Cancer_type, y = fitted_value,
# group = individual, color = individual)) +
# geom_line() +
# ggtitle("Fitted expression changes for the GJB1 gene")
#
#
benign_PC <- na.exclude(data_PC$org_value[tumor_for_single_gene == "B"])
malignant_PC <- na.exclude(data_PC$org_value[tumor_for_single_gene == "T"])
res_t_PC <- t.test(benign_PC ,malignant_PC , paired = TRUE)
res_t_PC
##
## Paired t-test
##
## data: benign_PC and malignant_PC
## t = -8.9681, df = 8, p-value = 1.902e-05
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -2.680927 -1.584213
## sample estimates:
## mean of the differences
## -2.13257
SampleGroup = pData(palmieri_final_PC)$Cancer.status
boxplot(exprs(palmieri_final_PC)["TC0X00007559.hg.1",]~SampleGroup , main="TC0X00007559.hg.1:GJB1",ylab="log2-transformed intensity")
The paired t-test on the gene gives a low p-value close to 0 and thus can conclude that the GJB1 gene is differentially expressed between benign and malignant tumors. We can also confirm this by checking from the expression changes plots and the boxplot.
We plot the volcano plot based on setting the cutoff of fold change to 1.5 and Benjamini-Hochberg corrected p-value to 5% and 10%, respectively.
DEG=table_PC
logFC_cutoff <- log2(1.5) #LogFC Threshold
DEG$change = as.factor(ifelse(DEG$adj.P.Val < 0.05 & abs(DEG$logFC) > logFC_cutoff,
ifelse(DEG$logFC > logFC_cutoff ,'UP','DOWN'),'NOT') #logFC>logFC_cutoff+adj.P<0.05->UP, if not then if abs(lofFC)>cutoff+adj.P<0.05->down, else->NOT;
)
this_tile <- paste0('Cutoff for logFC is ',round(logFC_cutoff,3),
'\nThe number of up gene is ',nrow(DEG[DEG$change =='UP',]) ,
'\nThe number of down gene is ',nrow(DEG[DEG$change =='DOWN',])
)
this_tile
## [1] "Cutoff for logFC is 0.585\nThe number of up gene is 1\nThe number of down gene is 0"
#head(DEG)
g = ggplot(data=DEG, aes(x=logFC, y=-log10(adj.P.Val), color=change)) +
geom_point(alpha=0.4, size=1.75) +
theme_set(theme_set(theme_bw(base_size=20)))+
xlab("log2 fold change") + ylab("-log10 FDR corrected p-value") +
ggtitle( this_tile ) + theme(plot.title = element_text(size=15,hjust = 0.5))+
scale_colour_manual(values = c('blue','black','red')) ## corresponding to the levels(res$change)
print(g)
DEG=table_PC
logFC_cutoff <- log2(1.5) #LogFC Threshold
DEG$change = as.factor(ifelse(DEG$adj.P.Val < 0.1 & abs(DEG$logFC) > logFC_cutoff,
ifelse(DEG$logFC > logFC_cutoff ,'UP','DOWN'),'NOT')
#logFC>logFC_cutoff+adj.P<0.1->UP, if not then if abs(lofFC)>cutoff+adj.P<0.1->down, else->NOT;
)
this_tile <- paste0('Cutoff for logFC is ',round(logFC_cutoff,3),
'\nThe number of up gene is ',nrow(DEG[DEG$change =='UP',]) ,
'\nThe number of down gene is ',nrow(DEG[DEG$change =='DOWN',])
)
this_tile
## [1] "Cutoff for logFC is 0.585\nThe number of up gene is 47\nThe number of down gene is 2"
#head(DEG)
g = ggplot(data=DEG, aes(x=logFC, y=-log10(adj.P.Val), color=change)) +
geom_point(alpha=0.4, size=1.75) +
theme_set(theme_set(theme_bw(base_size=20)))+
xlab("log2 fold change") + ylab("-log10 FDR corrected p-value") +
ggtitle( this_tile ) + theme(plot.title = element_text(size=15,hjust = 0.5))+
scale_colour_manual(values = c('blue','black','red')) ## corresponding to the levels(res$change)
print(g)
We plot a heatmap for the top 50 genes based on the topTable results from DGE analysis.
Top50 = topTable(palmieri_fit_PC,number=50,adjust="BH", genelist = fData(palmieri_final_PC)$SYMBOL) #?palmieri_fit_PC$genes$SYMBOL???????用哪一个
head(Top50)
## ID logFC AveExpr t P.Value adj.P.Val
## TC0X00007559.hg.1 GJB1 2.132570 5.149454 9.058751 1.711672e-06 0.03793921
## TC0600014215.hg.1 GMDS 2.289623 4.568959 8.000315 5.797001e-06 0.05197291
## TC0400007511.hg.1 DANCR 2.649395 5.968684 7.841290 7.034456e-06 0.05197291
## TC1100006671.hg.1 OR51F2 2.820101 4.477998 7.440521 1.160157e-05 0.06428721
## TC1700012093.hg.1 FASN 1.783100 5.668067 7.186389 1.608970e-05 0.07132565
## TC0700010910.hg.1 POLD2 1.550022 6.122438 6.720457 2.991761e-05 0.08223906
## B
## TC0X00007559.hg.1 4.789762
## TC0600014215.hg.1 3.882451
## TC0400007511.hg.1 3.733359
## TC1100006671.hg.1 3.341640
## TC1700012093.hg.1 3.080948
## TC0700010910.hg.1 2.577001
str(Top50)
## 'data.frame': 50 obs. of 7 variables:
## $ ID : chr "GJB1" "GMDS" "DANCR" "OR51F2" ...
## $ logFC : num 2.13 2.29 2.65 2.82 1.78 ...
## $ AveExpr : num 5.15 4.57 5.97 4.48 5.67 ...
## $ t : num 9.06 8 7.84 7.44 7.19 ...
## $ P.Value : num 1.71e-06 5.80e-06 7.03e-06 1.16e-05 1.61e-05 ...
## $ adj.P.Val: num 0.0379 0.052 0.052 0.0643 0.0713 ...
## $ B : num 4.79 3.88 3.73 3.34 3.08 ...
## visualize
X = exprs(palmieri_final_PC)
Xtop = X[rownames(Top50),]
rownames(Xtop) = Top50$ID
colnames(Xtop) <- c("10A1_T.CEL",
"10E5_B.CEL",
"12A1_T.CEL",
"12E5_B.CEL",
"13A1_T.CEL",
"13E5_B.CEL",
"14A1_T.CEL",
"14E5_B.CEL",
"15A1_T_2.CEL",
"15D4_B_2.CEL",
"17A1_B.CEL",
"17B2_T.CEL",
"7A1_B.CEL",
"7K11_T.CEL",
"8A1_B.CEL",
"8G7_T.CEL",
"904_B.CEL",
"9A1_T.CEL")
heatmap(Xtop, Colv = NA, margins = c(7,2))
gene_for_up_down_set_PC <- bitr(unique(DEG$SYMBOL), fromType = "SYMBOL",
toType = c( "ENTREZID"),
OrgDb = org.Hs.eg.db) #24199 in total
## 'select()' returned 1:many mapping between keys and columns
## Warning in bitr(unique(DEG$SYMBOL), fromType = "SYMBOL", toType =
## c("ENTREZID"), : 10.01% of input gene IDs are fail to map...
head(gene_for_up_down_set_PC)
## SYMBOL ENTREZID
## 1 GJB1 2705
## 2 GMDS 2762
## 3 DANCR 57291
## 4 OR51F2 119694
## 5 FASN 2194
## 6 POLD2 5425
dim(gene_for_up_down_set_PC)
## [1] 19949 2
head(DEG)
## PROBEID ID SYMBOL
## TC0X00007559.hg.1 TC0X00007559.hg.1 NM_000166 GJB1
## TC0600014215.hg.1 TC0600014215.hg.1 NM_001253846 GMDS
## TC0400007511.hg.1 TC0400007511.hg.1 NR_024031 DANCR
## TC1100006671.hg.1 TC1100006671.hg.1 NM_001004753 OR51F2
## TC1700012093.hg.1 TC1700012093.hg.1 NM_004104 FASN
## TC0700010910.hg.1 TC0700010910.hg.1 NM_001127218 POLD2
## GENENAME
## TC0X00007559.hg.1 gap junction protein beta 1
## TC0600014215.hg.1 GDP-mannose 4,6-dehydratase
## TC0400007511.hg.1 differentiation antagonizing non-protein coding RNA
## TC1100006671.hg.1 olfactory receptor, family 51, subfamily F, member 2
## TC1700012093.hg.1 fatty acid synthase
## TC0700010910.hg.1 polymerase (DNA directed), delta 2, accessory subunit
## logFC AveExpr t P.Value adj.P.Val B
## TC0X00007559.hg.1 2.132570 5.149454 9.058751 1.711672e-06 0.03793921 4.789762
## TC0600014215.hg.1 2.289623 4.568959 8.000315 5.797001e-06 0.05197291 3.882451
## TC0400007511.hg.1 2.649395 5.968684 7.841290 7.034456e-06 0.05197291 3.733359
## TC1100006671.hg.1 2.820101 4.477998 7.440521 1.160157e-05 0.06428721 3.341640
## TC1700012093.hg.1 1.783100 5.668067 7.186389 1.608970e-05 0.07132565 3.080948
## TC0700010910.hg.1 1.550022 6.122438 6.720457 2.991761e-05 0.08223906 2.577001
## change
## TC0X00007559.hg.1 UP
## TC0600014215.hg.1 UP
## TC0400007511.hg.1 UP
## TC1100006671.hg.1 UP
## TC1700012093.hg.1 UP
## TC0700010910.hg.1 UP
dim(DEG)
## [1] 22165 11
DEG_for_up_down_set_PC=merge(DEG,gene_for_up_down_set_PC,by.y='SYMBOL',by.x='SYMBOL') #merge symbol with entrezID
head(DEG_for_up_down_set_PC)
## SYMBOL PROBEID ID
## 1 A1BG-AS1 TC1900009040.hg.1 NR_015380
## 2 A2M TC1200009847.hg.1 NM_000014
## 3 A2M-AS1 TC1200006741.hg.1 NR_026971
## 4 A2ML1 TC1200006736.hg.1 NM_001282424
## 5 A2ML1-AS1 TC1200009835.hg.1 ENST00000537288.1
## 6 A2ML1-AS2 TC1200009839.hg.1 ENST00000394240.3
## GENENAME logFC
## 1 A1BG antisense RNA 1 0.36963760
## 2 alpha-2-macroglobulin -0.18015821
## 3 A2M antisense RNA 1 (head to head) -0.14851336
## 4 alpha-2-macroglobulin-like 1 -0.06491690
## 5 A2ML1 antisense RNA 1 [Source:HGNC Symbol;Acc:HGNC:41022] -0.37139844
## 6 A2ML1 antisense RNA 2 [Source:HGNC Symbol;Acc:HGNC:41523] -0.06509982
## AveExpr t P.Value adj.P.Val B change ENTREZID
## 1 3.303458 1.4971312 0.1619753 0.4672892 -4.987661 NOT 503538
## 2 6.912405 -0.3465907 0.7353172 0.8861146 -5.970505 NOT 2
## 3 3.437194 -0.8063836 0.4367948 0.7061497 -5.707618 NOT 144571
## 4 2.406198 -0.4227723 0.6804573 0.8575844 -5.940807 NOT 144568
## 5 2.454145 -1.7828923 0.1016728 0.3874791 -4.604878 NOT 100874108
## 6 5.953749 -0.4441798 0.6653651 0.8493792 -5.931437 NOT 106478979
dim(DEG_for_up_down_set_PC)
## [1] 19949 12
Now we have our input gene set object DEG_for_up_down_set_PC and will perform the GSEA
#BiocManager::install("clusterProfiler")
library(clusterProfiler)
#BiocManager::install("enrichplot")
library(enrichplot)
# SET THE DESIRED ORGANISM HERE
organism_PC = "org.Hs.eg.db"
#BiocManager::install(organism_PC, character.only = TRUE)
library(organism_PC, character.only = TRUE)
### Prepare Input:
df_GO = DEG_for_up_down_set_PC
# we want the log2 fold change
original_geneList_PC <- df_GO$logFC #THIS IS THE ORIGINAL INPUT OF GENE LIST
# name the vector
names(original_geneList_PC) <- df_GO$SYMBOL
# omit any NA values
geneList_GO_PC <-na.omit(original_geneList_PC) #其实没有NA了,之前全都过滤掉了
# sort the list in decreasing order (required for clusterProfiler); final geneset list
geneList_GO_PC = sort(geneList_GO_PC, decreasing = TRUE)
###Gene Set Enrichment(GO):
gse_GO_PC<- gseGO(geneList=geneList_GO_PC,
ont ="ALL",
keyType = "SYMBOL",
nPerm = 10000, #10000,
minGSSize = 3,
maxGSSize = 800,
pvalueCutoff = 0.05,
verbose = TRUE,
OrgDb = org.Hs.eg.db,
pAdjustMethod = "BH")
## Warning in .GSEA(geneList = geneList, exponent = exponent, minGSSize =
## minGSSize, : We do not recommend using nPerm parameter incurrent and future
## releases
## Warning in fgsea(pathways = geneSets, stats = geneList, nperm = nPerm, minSize
## = minGSSize, : You are trying to run fgseaSimple. It is recommended to use
## fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument
## in the fgsea function call.
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (0.33% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize,
## gseaParam, : There are duplicate gene names, fgsea may produce unexpected
## results.
## Warning in fgseaSimple(...): There were 7 pathways for which P-values were not
## calculated properly due to unbalanced gene-level statistic values
head(gse_GO_PC)[,1:10]
## ONTOLOGY ID Description setSize
## GO:0000139 CC GO:0000139 Golgi membrane 685
## GO:0000151 CC GO:0000151 ubiquitin ligase complex 253
## GO:0000209 BP GO:0000209 protein polyubiquitination 288
## GO:0000226 BP GO:0000226 microtubule cytoskeleton organization 483
## GO:0000228 CC GO:0000228 nuclear chromosome 474
## GO:0000280 BP GO:0000280 nuclear division 324
## enrichmentScore NES pvalue p.adjust qvalues rank
## GO:0000139 0.4915928 1.447689 9.999e-05 0.002085725 0.001717074 6571
## GO:0000151 0.4969580 1.436450 9.999e-05 0.002085725 0.001717074 7335
## GO:0000209 0.5318977 1.541338 9.999e-05 0.002085725 0.001717074 5412
## GO:0000226 0.4536137 1.328601 9.999e-05 0.002085725 0.001717074 6165
## GO:0000228 0.5253868 1.538213 9.999e-05 0.002085725 0.001717074 6177
## GO:0000280 0.4695175 1.363480 9.999e-05 0.002085725 0.001717074 6430
The GSEA results are shown above in the object gse_GO_PC
###Output/visualization of the GSEA results:
require(DOSE)
dotplot(gse_GO_PC, showCategory=10, split=".sign") + facet_grid(.~.sign)
emapplot(gse_GO_PC,showCategory = 10)
# categorySize can be either 'pvalue' or 'geneNum'
# cnetplot(gse_GO_Islet, categorySize="pvalue", foldChange=geneList_GO_Islet, showCategory = 3)
cnetplot(gse_GO_PC, categorySize="pvalue", foldChange=geneList_GO_PC,showCategory = 3)
cnetplot(gse_GO_PC, node_label = "category",showCategory = 3)
# Use the `Gene Set` param for the index in the title, and as the value for geneSetId
# gseaplot(gse_GO_Islet, by = "all", title = gse_GO_Islet$Description[1], geneSetID = 1)
gseaplot(gse_GO_PC, by = "all", title = gse_GO_PC$Description[1], geneSetID = 1)