Data pre-processing

ExpressionSet Construction

Our (Affymetrix) microarray analysis starts with raw CEL files. CEL files are the results of the processing of the raw image files using the Affymetrix software and contain estimated probe intensity values. We first import the CEL files stored in the folder from the working directory. We use a function read.celfiles from the oligo package to import our raw CEL files into R by using the following codes:

CELs_PC <- list.celfiles("/Users/zhuofanmou/All documents/UOE/PhD work/my project/raw_CEL_analysis/workspace", full.names = TRUE) # change this path to where the CEL files are stored 
raw_data_PC <- oligo::read.celfiles(CELs_PC) # read in the CEL files
stopifnot(validObject(raw_data_PC))

The ExpressionSet named raw_data_PC is now created, then we need to specify our prostate sample information as phenoData in the ExpressionSet. According to the names of the CEL files (i.e., colnames(exprs(raw_data_PC))), we create the phenoData which contains the columns of our interest (i.e., sample ID and tumor status). The information is shown below:

Biobase::pData(raw_data_PC)$Sample.ID <- 
  c('10A1','10E5','12A1','12E5','13A1','13E5','14A1','14E5','15A1_2','15A1','15D4_2','15D4','17A1','17B2','7A1','7K11','8A1','8G7','904','9A1') # adding patients label
Biobase::pData(raw_data_PC)$Cancer.status <- 
  c('T','B','T','B','T','B','T','B','T','T','B','B','B','T','B','T','B','T','B','T') # adding Cancer status
head(Biobase::pData(raw_data_PC))

##                                                  index Sample.ID Cancer.status
## DC_UKB_LH_10A1_T_Sample_4_(Clariom_D_Human).CEL      1      10A1             T
## DC_UKB_LH_10E5_B_Sample_3_(Clariom_D_Human).CEL      2      10E5             B
## DC_UKB_LH_12A1_T_Sample_2_(Clariom_D_Human).CEL      3      12A1             T
## DC_UKB_LH_12E5_B_Sample_1_(Clariom_D_Human).CEL      4      12E5             B
## DC_UKB_LH_13A1_T_Sample_20_(Clariom_D_Human).CEL     5      13A1             T
## DC_UKB_LH_13E5_B_Sample_19_(Clariom_D_Human).CEL     6      13E5             B

The major components of our ExpressionSet are now completed. We can use the function exprs(raw_data_PC) and pData(raw_data_PC) to access the expression matrix and phenoData respectively. The heads of the results are shown below:

head(exprs(raw_data_PC))

##   DC_UKB_LH_10A1_T_Sample_4_(Clariom_D_Human).CEL
## 1                                            3896
## 2                                             328
## 3                                            4157
## 4                                             256
## 5                                             112
## 6                                             193
##   DC_UKB_LH_10E5_B_Sample_3_(Clariom_D_Human).CEL
## 1                                            3260
## 2                                             112
## 3                                            3455
## 4                                             118
## 5                                             133
## 6                                             133
##   DC_UKB_LH_12A1_T_Sample_2_(Clariom_D_Human).CEL
## 1                                            4821
## 2                                             159
## 3                                            4571
## 4                                             147
## 5                                             102
## 6                                             237
##   DC_UKB_LH_12E5_B_Sample_1_(Clariom_D_Human).CEL
## 1                                            4224
## 2                                              84
## 3                                            4050
## 4                                              67
## 5                                              88
## 6                                             244
##   DC_UKB_LH_13A1_T_Sample_20_(Clariom_D_Human).CEL
## 1                                             3416
## 2                                              176
## 3                                             3530
## 4                                              134
## 5                                              158
## 6                                              175
##   DC_UKB_LH_13E5_B_Sample_19_(Clariom_D_Human).CEL
## 1                                             3339
## 2                                               84
## 3                                             3137
## 4                                               97
## 5                                              102
## 6                                              209
##   DC_UKB_LH_14A1_T_Sample_18_(Clariom_D_Human).CEL
## 1                                             3717
## 2                                              126
## 3                                             3881
## 4                                               95
## 5                                              105
## 6                                              224
##   DC_UKB_LH_14E5_B_Sample_17_(Clariom_D_Human).CEL
## 1                                             2421
## 2                                               77
## 3                                             3162
## 4                                               91
## 5                                              107
## 6                                              132
##   DC_UKB_LH_15A1_T_2_Sample_14_(Clariom_D_Human).CEL
## 1                                               4938
## 2                                                111
## 3                                               4613
## 4                                                108
## 5                                                111
## 6                                                155
##   DC_UKB_LH_15A1_T_Sample_16_(Clariom_D_Human).CEL
## 1                                             3043
## 2                                               88
## 3                                             3240
## 4                                               91
## 5                                               78
## 6                                              130
##   DC_UKB_LH_15D4_B_2_Sample_13_(Clariom_D_Human).CEL
## 1                                               3442
## 2                                                104
## 3                                               2886
## 4                                                 98
## 5                                                144
## 6                                                130
##   DC_UKB_LH_15D4_B_Sample_15_(Clariom_D_Human).CEL
## 1                                             3561
## 2                                              102
## 3                                             3333
## 4                                               87
## 5                                              139
## 6                                              196
##   DC_UKB_LH_17A1_B_Sample_11_(Clariom_D_Human).CEL
## 1                                             2934
## 2                                              284
## 3                                             3882
## 4                                              241
## 5                                              195
## 6                                               78
##   DC_UKB_LH_17B2_T_Sample_12_(Clariom_D_Human).CEL
## 1                                             4458
## 2                                              130
## 3                                             4099
## 4                                               80
## 5                                              141
## 6                                              104
##   DC_UKB_LH_7A1_B_Sample_9_(Clariom_D_Human).CEL
## 1                                           2741
## 2                                            106
## 3                                           3196
## 4                                             87
## 5                                            141
## 6                                            100
##   DC_UKB_LH_7K11_T_Sample_10_(Clariom_D_Human).CEL
## 1                                             5500
## 2                                              110
## 3                                             5242
## 4                                              240
## 5                                              176
## 6                                              211
##   DC_UKB_LH_8A1_B_Sample_7_(Clariom_D_Human).CEL
## 1                                           5150
## 2                                            303
## 3                                           4874
## 4                                            180
## 5                                            112
## 6                                             98
##   DC_UKB_LH_8G7_T_Sample_8_(Clariom_D_Human)_2.CEL
## 1                                             3899
## 2                                               91
## 3                                             3606
## 4                                               95
## 5                                               85
## 6                                              179
##   DC_UKB_LH_904_B_Sample_5_(Clariom_D_Human).CEL
## 1                                           4286
## 2                                            130
## 3                                           3923
## 4                                             82
## 5                                            129
## 6                                            140
##   DC_UKB_LH_9A1_T_Sample_6_(Clariom_D_Human)_2.CEL
## 1                                             2448
## 2                                              236
## 3                                             3113
## 4                                              166
## 5                                              115
## 6                                               81

head(pData(raw_data_PC))

##                                                  index Sample.ID Cancer.status
## DC_UKB_LH_10A1_T_Sample_4_(Clariom_D_Human).CEL      1      10A1             T
## DC_UKB_LH_10E5_B_Sample_3_(Clariom_D_Human).CEL      2      10E5             B
## DC_UKB_LH_12A1_T_Sample_2_(Clariom_D_Human).CEL      3      12A1             T
## DC_UKB_LH_12E5_B_Sample_1_(Clariom_D_Human).CEL      4      12E5             B
## DC_UKB_LH_13A1_T_Sample_20_(Clariom_D_Human).CEL     5      13A1             T
## DC_UKB_LH_13E5_B_Sample_19_(Clariom_D_Human).CEL     6      13E5             B

Note that there is one patient labelled with 15 which has two sample replicates, so we remove the first replicate (i.e., 15A1 and 15D4) so that we keep only one patient for every paired malignant/benign samples.

raw_data_15rem_PC = raw_data_PC[ , -c(10,12)]
raw_data_PC = raw_data_15rem_PC

Finally, the ExpressionSet raw_data_PC, apart from the missing of the featureData which we will specify it when we perform the probeset annotation later, is ready to be used for our following analysis.

raw_data_PC

## HTAFeatureSet (storageMode: lockedEnvironment)
## assayData: 6892960 features, 18 samples 
##   element names: exprs 
## protocolData
##   rowNames: DC_UKB_LH_10A1_T_Sample_4_(Clariom_D_Human).CEL
##     DC_UKB_LH_10E5_B_Sample_3_(Clariom_D_Human).CEL ...
##     DC_UKB_LH_9A1_T_Sample_6_(Clariom_D_Human)_2.CEL (18 total)
##   varLabels: exprs dates
##   varMetadata: labelDescription channel
## phenoData
##   rowNames: DC_UKB_LH_10A1_T_Sample_4_(Clariom_D_Human).CEL
##     DC_UKB_LH_10E5_B_Sample_3_(Clariom_D_Human).CEL ...
##     DC_UKB_LH_9A1_T_Sample_6_(Clariom_D_Human)_2.CEL (18 total)
##   varLabels: index Sample.ID Cancer.status
##   varMetadata: labelDescription channel
## featureData: none
## experimentData: use 'experimentData(object)'
## Annotation: pd.clariom.d.human

Quality check for the raw data vs RMA normalized data

In microarray and other high-throughput technologies, it is important to assess the quality of raw data, also known as quality control. It helps us to check the existence of any sample outliers, to see if the data are clustered as expected (based on the tumor status: T=malignant/tumor or B=benign/normal) and most importantly whether or not the data need to be normalized so that the data from different samples are comparable. We can obtain the expression matrix stored in the ExpressionSet by calling exprs(raw_data_PC). The rows of the expression matrix represent the microarray probes, while the columns represent sample arrays of paired malignant/benign tumor status of every patient, respectively (i.e., 18 columns in total as we have 9 patients).

Generally, there are many sources of noise in microarray experiments:

different amounts of RNA used for labeling and hybridization
imperfections on the array surface
imperfect synthesis of the probes
differences in hybridization conditions

Systematic differences between the samples that are due to noise rather than true biological variability should be removed in order to make biologically meaningful conclusions about the data. Thus, a proper normalisation method is usually required.

In this report, we will apply Robust multichip average (RMA) algorithm to our raw data. The RMA convolution model can be integrated as the following procedures:

model based background correcting/adjustment
log2 transformation and quantile normalisation
robust summary method (median polish summarization)

Note that we can summarize the probes on the levels of known transcripts (by setting target = core in the function rma from package oligo).

We first take the log2 of Biobase::exprs(raw_data), as expression data is more convenient to be analyzed on a logarithmic scale.

exp_raw_PC <- log2(Biobase::exprs(raw_data_PC)) # log2 transform

We also perform the series of steps of RMA algorithm by calling the function rma from oligo. Then the RMA calibrated ExpressionSet is named palmieri_eset_norm_PC. Note that here we retain all probes available (i.e., no pre-filtered) for normalisation.

palmieri_eset_norm_PC <- oligo::rma(raw_data_PC, target = "core") # RMA on the raw data

## Background correcting
## Normalizing
## Calculating Expression

palmieri_eset_norm_PC

## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 138745 features, 18 samples 
##   element names: exprs 
## protocolData
##   rowNames: DC_UKB_LH_10A1_T_Sample_4_(Clariom_D_Human).CEL
##     DC_UKB_LH_10E5_B_Sample_3_(Clariom_D_Human).CEL ...
##     DC_UKB_LH_9A1_T_Sample_6_(Clariom_D_Human)_2.CEL (18 total)
##   varLabels: exprs dates
##   varMetadata: labelDescription channel
## phenoData
##   rowNames: DC_UKB_LH_10A1_T_Sample_4_(Clariom_D_Human).CEL
##     DC_UKB_LH_10E5_B_Sample_3_(Clariom_D_Human).CEL ...
##     DC_UKB_LH_9A1_T_Sample_6_(Clariom_D_Human)_2.CEL (18 total)
##   varLabels: index Sample.ID Cancer.status
##   varMetadata: labelDescription channel
## featureData: none
## experimentData: use 'experimentData(object)'
## Annotation: pd.clariom.d.human

We can see that there are overall 138745 features summarised in the ExpressionSet palmieri_eset_norm_PC and that means we have 138745 rows of transcript clusters/genes in the expression data exprs(palmieri_eset_norm_PC) .

Now, we will assess the data quality by using various diagnostic plots (e.g., boxplots, MA plots and density plots) for both the log2-transformed raw data and the RMA calibrated data.

MA plot

One of the most used quality assessments (on the raw data) is the pairwise MA plots. It has log-ratio, M, on the horizontal axis, versus average log-intensities, A, on the vertical axis. In other words, if we have two samples \(i\) and \(j\) together with \(I_{i}\) and \(I_{j}\) representing the gene expression values in sample \(i\) and \(j\), respectively, then based on \(log_{2}\) scale, we have: \[M_{i,j}=log_{2}(I_{i})-log_{2}(I_{j})=log_{2}(I_{i}/I_{j})\]

difference in \(log_{2}\) expression or \(log_{2}\)-ratio between 2 samples

\[A_{i,j}= \frac{1}{2}[log_{2}(I_{i})+log_{2}(I_{j})]=log_{2}(\sqrt{I_{i} \times I_{j}})\]

average log2 expression between two samples.

Now, we produce MA plots for four randomly picked samples for the raw data (left) and RMA calibrated data(right)

# For raw data
affy::mva.pairs(pm(raw_data_PC)[,c(4,8,12,18)],plot.method="normal",  main = "MAplot for Prostate raw data ") # Note that we only consider the perfect matched probes 

# For RMA data
exprs_RMA_PC = exprs(palmieri_eset_norm_PC)
affy::mva.pairs(exprs_RMA_PC[,c(4,8,12,18)],plot.method="normal",  main = "MAplot for prostate data after RMA ")

Ideally, the cloud of data points in the MA-plot should be centered around M=0 (blue line). This is because we assume that the majority of the genes is not DE and that the number of upregulated genes is similar to the number of down-regulated genes. Additionally, the variability of the M values should be similar across different array-array combinations. As we can see from left that the spread of the point cloud increases with the average intensity: the loess curve (the fitted red line) for the genes in each pairwise plot moves further and further away (or up and down) from M=0 when A increases, which indicate a normalization is needed to remove the dependency. Then the RMA algorithm was applied to normalise the data and the results are shown in the right. It now shows a much more symmetric and even spread of the data points indicating that the dependence of the variability on the expression level is not as strong as it was before normalization.

Density plot

One quality control check is to plot the distribution of log base 2 intensities of perfect match probes for comparison of probe intensity behavior between different arrays (\(log_{2}(PM_{ij})\) for array i and probe j). For the raw data, there are clear differences in shape of the sample distributions (see left), indicating that a proper normalization is required. And after the we apply the RMA normalisation, the shapes of the sample distributions are now very similar (see right), meaning that the data from each sample are now adjusted and comparable.

# For raw data
pmexp_PC = pm(raw_data_PC); #or can use raw_data_Islet; Note that we only consider the perfect matched probes 
sampleNames_PC = vector()
logs_PC = vector()
for (i in 1:18)
{
  sampleNames_PC = c(sampleNames_PC,rep(as.character(pData(raw_data_PC)[i,2]),dim(pmexp_PC)[1])) #Note: here sample names in 'pData(raw_data_Islet)[i,2]' must be character: 'as.character(pData(raw_data_Islet)[i,2])'
  logs_PC = c(logs_PC,log2(pmexp_PC[,i]))
}

logData_PC = data.frame(logInt=logs_PC,sampleName=sampleNames_PC)

dataHist2_PC = ggplot(logData_PC, aes(logInt, colour = sampleName)) 
dataHist2_PC + geom_density()+ ggtitle("Density plot of the log2-intensitites raw data")

# For RMA data
exp_palmieri_PC <- Biobase::exprs(palmieri_eset_norm_PC)
exp_RMA_PC = exp_palmieri_PC  
sampleNames_RMA_PC = vector()
logs_RMA_PC = vector()
for (i in 1:18)
{
  sampleNames_RMA_PC = c(sampleNames_RMA_PC,rep(as.character(pData(palmieri_eset_norm_PC)[i,2]),dim(exp_RMA_PC)[1])) #Note: here sample names in 'pData(raw_data_Islet)[i,2]' must be character: 'as.character(pData(raw_data_Islet)[i,2])'
  logs_RMA_PC = c(logs_RMA_PC,exp_RMA_PC[,i])
}

logData_RMA_PC = data.frame(logInt=logs_RMA_PC,sampleName=sampleNames_RMA_PC)

dataHist2_RMA_PC = ggplot(logData_RMA_PC, aes(logInt, colour = sampleName)) 
dataHist2_RMA_PC + geom_density() + ggtitle("Density plot of the RMA calibrated prostate data")

Boxplot plot

Both boxplots and density plots show the same differences in probe intensity behavior between sample arrays. In order to perform meaningful statistical analysis and inferences from the data, we need to ensure that all the samples are comparable. To examine and compare the overall distribution of log transformed perfect match intensities between the samples we can use a density plot, but we can get a clearer view with a box plot as it shows:

the median: center value, half of the intensities are lower than this value, half of the intensities are higher (= line in the box)
the upper quartile: a quarter of the values are higher than this quartile (= upper border of the box)
the lower quartile: a quarter of the values are lower than this quartile (= lower border of the box)
the range: minimum and maximum value (= borders of the whiskers)

We note that by using boxplot function from oligo, we do not need to previoulsy log2 transform the expression data. Instead, the function uses ExpressionSet as input directly.

#For raw data
oligo::boxplot(raw_data_PC, target = "core", col = FALSE,
               main = "Boxplot of log2-intensitites for Prostate raw data", xlab = "Sample Number", ylab = "Log2-intensitites", names = pData(raw_data_PC)[ , 2])

# For RMA data

oligo::boxplot(palmieri_eset_norm_PC, target = "core", col = FALSE,
               main = "Boxplot of RMA calibrated prostate data", xlab = "Sample Number", ylab = "Log2-intensitites", names = pData(palmieri_eset_norm_PC)[ , 2])

As we can see from above that after normalization, none of the samples stand out from the rest. The different arrays should now have a very comparable median expression level. Also, the scale of the boxes should be very comparable indicating that the spread of the intensity values on the different arrays is equalized.

Clustering

Now, to check whether the overall variability of the samples reflects their grouping, we can perform a Principal Component Analysis (PCA) and hierarchical clustering dendrogram. In other words, we can check if the benign tumor (B) samples are homogenous and distinguishable from samples of the other malignant tumor (T) group.

PCA plot

# For raw data
PCA_raw_PC <- prcomp(t(exp_raw_PC), scale. = FALSE) 

percentVar_PC <- round(100*PCA_raw_PC$sdev^2/sum(PCA_raw_PC$sdev^2),1)
sd_ratio_PC <- sqrt(percentVar_PC[2] / percentVar_PC[1])

dataGG_PC <- data.frame(PC1 = PCA_raw_PC$x[,1], PC2 = PCA_raw_PC$x[,2],
                           Tumor_status = pData(raw_data_PC)$Cancer.status)

ggplot(dataGG_PC, aes(PC1, PC2)) +
  geom_point(aes(colour = Tumor_status)) +
  geom_text(aes(label =pData(raw_data_PC)[ , 2] ), hjust = -0.1, vjust = -0.3) + # add point names
  ggtitle("PCA plot of the log-transformed raw expression data") +
  xlab(paste0("PC1, VarExp: ", percentVar_PC[1], "%")) +
  xlim(-1000,2000)+
  ylim(-1000,1000)+ 
  ylab(paste0("PC2, VarExp: ", percentVar_PC[2], "%")) +
  theme(plot.title = element_text(hjust = 0.5))+
  coord_fixed(ratio = sd_ratio_PC) +
  scale_color_manual(values = c("#666666", "#3399FF"))


# For RMA data
exp_palmieri_PC <- Biobase::exprs(palmieri_eset_norm_PC)

PCA_RMA_PC <- prcomp(t(exp_palmieri_PC), scale = FALSE)

percentVar_RMA_PC <- round(100*PCA_RMA_PC$sdev^2/sum(PCA_RMA_PC$sdev^2),1)
sd_ratio_RMA_PC <- sqrt(percentVar_RMA_PC[2] / percentVar_RMA_PC[1])

dataGG_RMA_PC <- data.frame(PC1 = PCA_RMA_PC$x[,1], PC2 = PCA_RMA_PC$x[,2],
                               Tumor_status =  Biobase::pData(palmieri_eset_norm_PC)$Cancer.status)

ggplot(dataGG_RMA_PC, aes(PC1, PC2)) +
  geom_point(aes(colour = Tumor_status)) +
  geom_text(aes(label =pData(palmieri_eset_norm_PC)[ , 2] ), hjust = -0.1, vjust = -0.3) + # add point names
  ggtitle("PCA plot of the RMA calibrated, summarized data") +
  xlab(paste0("PC1, VarExp: ", percentVar_RMA_PC[1], "%")) +
  xlim(-200,400)+
  ylim(-200,200)+
  ylab(paste0("PC2, VarExp: ", percentVar_RMA_PC[2], "%")) +
  theme(plot.title = element_text(hjust = 0.5)) +
  coord_fixed(ratio = sd_ratio_RMA_PC) +
  scale_color_manual(values = c("#666666", "#3399FF"))

Dendrogram

We also plot the dendrogram to show the clustering of the sample after RMA calibrated (ideally, we want the cancer status labelled with B and T are clustered seperatly) (Q: Do we usually plot sample-to-sample dendro for paired dataset ot is it worthless?)

Biobase::pData(palmieri_eset_norm_PC)$Sample.ID_for_dendrogram = c('10A1_T','10E5_B','12A1_T','12E5_B','13A1_T','13E5_B','14A1_T','14E5_B','15A1_2_T','15D4_2_B','17A1_B','17B2_T','7A1_B','7K11_T','8A1_B','8G7_T','904_B','9A1_T')#c('10A1_T','10E5_B','12A1_T','12E5_B','13A1_T','13E5_B','14A1_T','14E5_B','15A1_2_T','15D4_2_B','17A1_B','17B2_T','7A1_B','7K11_T','8A1_B','8G7_T','904_B','9A1_T')
clustering <- exp_palmieri_PC %>% t() %>% dist(method = "euclidean") %>% 
  hclust(method = "complete")
plot(clustering, labels = (pData(palmieri_eset_norm_PC)$Sample.ID_for_dendrogram),main = "Cluster dendrogram of 18 samples (Euclidean)")

The PCA plots indicates that the PC1 differentiates between the tumor types, although the data points are still noisy.

The dendrogram confirms that benign tumor samples such as 7A1, 904,12E5 and 14E5 are misclustered with those of malignant tumor samples.

(Q: Any possible reasons?: Patients 7&12 are under specific treatments, patients 9&14 are the only active smokers among the patients?)

Annotation of the transcript clusters

Now, we add ‘featureData’, i.e. annotation information of the transcript cluster identifiers stored in the ExpressionSet.

There are two ways of annotating:

using the annotation that is assembled by the BioC core team using data from public repositories, either on the levels of probe set regions ‘clariomdhumanprobeset.db’ or transcripts ‘clariomdhumantranscriptcluster.db’. By doing this way, there are overall 24406 out of 138745 rows of the probset/genes actually annotated.
using the annotation provided by Affymetrix (as available on their NetAffx site), which is contained in the Platform Design (pd) package ‘pd.clariom.d.human’. By doing this way, there are overall 86161 out of 138745 rows of the probset/genes actually annotated.

The second approach will be used as it gives ~60% of the total transcript clusters annotated.

In order to annotate the probeset, we will use the package affycoretools and apply the function annotateEset; the feature information is then automatically stored to the ‘featureData’ of the ExpressionSet palmieri_eset_norm_PC

library(affycoretools)
palmieri_eset_norm_PC <- annotateEset(palmieri_eset_norm_PC, pd.clariom.d.human) # annotating the features with annotation package 'pd.clariom.d.human'

#palmieri_eset_norm_PC <- annotateEset(palmieri_eset_norm_PC, clariomdhumantranscriptcluster.db) # this line of code is for the First approach

Now, to access the featureData, we can use

head(fData(palmieri_eset_norm_PC))

##                             PROBEID   ID SYMBOL GENENAME
## AFFX-BkGr-GC03_st AFFX-BkGr-GC03_st <NA>   <NA>     <NA>
## AFFX-BkGr-GC04_st AFFX-BkGr-GC04_st <NA>   <NA>     <NA>
## AFFX-BkGr-GC05_st AFFX-BkGr-GC05_st <NA>   <NA>     <NA>
## AFFX-BkGr-GC06_st AFFX-BkGr-GC06_st <NA>   <NA>     <NA>
## AFFX-BkGr-GC07_st AFFX-BkGr-GC07_st <NA>   <NA>     <NA>
## AFFX-BkGr-GC08_st AFFX-BkGr-GC08_st <NA>   <NA>     <NA>

However, as we can see above there are many rows of probesets which are not annotated (i.e., NAs).

To see how many rows of transcrip clusters are truly annotated, we can use the following code

length(which(!is.na(fData(palmieri_eset_norm_PC)$SYMBOL)))

## [1] 86161

There are 86161 out of 138745 rows of the probset/genes annotated.

Gene filtering/reduction prior to further analysis

Now, we perform the following filtering steps priori to futher analysis such as differential gene expression analysis:

filtering out lowly expressed genes/transcripts and keep those which have intensities larger that the threshold in at least 50% samples (i.e.,9)
removing probes without a specific gene identifier (i.e.,gene sysmbol) (Q: by EntrezID?)
~~removing the probes that have multiple gene mappings~~ (there is no multiple mapping probe when using the annotation package ‘pd.clariom.d.human’)
removing probes/transcripts identified by library AceView
removing probes identified as pseudogenes
removing probes without a specific genename (i.e.,‘- - -’)
collapsing multiple probes into one for the same gene by taking the one with the maximum row-wise mean across all samples

Filtering based on intensity

Now, one of the strategies we can use in order to reduce the number of gene input is to filter out lowly expressed genes. Microarray data commonly show a large number of probes in the background intensity range. These probes also do not change much across arrays. Hence they combine a low variance with a low intensity. Thus, they could end up being detected as differentially expressed although they are barely above the “detection” limit and are not very informative in general.

For intensity-based filtering, we calculate the row-wise means/averages from the expression data, as they represent the transcript means, and assign them to palmieri_means_PC. We also apply the function shorth deom the genefilter package in order to decide the expression cutoff value. The shorth is the shortest interval that covers half of the values in x (i.e., the mean vector for each probe across the whole samples). This function calculates the mean of the x values that lie in the shorth. This was proposed by Andrews (1972) as a robust estimator of location.

palmieri_means_PC <- rowMeans(Biobase::exprs(palmieri_eset_norm_PC))

hist_res_PC <- hist(palmieri_means_PC, 100, col = "azure3", freq = FALSE, 
                       main = "Histogram of the mean intensities", 
                       border = "antiquewhite4",
                       xlab = "Mean intensities")



sh_cutoff <- shorth(palmieri_means_PC) #https://support.bioconductor.org/p/69467/#69744); shorth cutoff: 2.089

# histgram with abline
man_threshold_PC <- sh_cutoff  #!NEED DISCUSSSION WITH LORNA

hist_res_PC <- hist(palmieri_means_PC, 100, col = "azure3", freq = FALSE, 
                       main = "Histogram of the mean intensities with cutoff line",
                       border = "antiquewhite4",
                       xlab = "Mean intensities")

abline(v = man_threshold_PC, col = "coral4", lwd = 2)


filtered.set <- palmieri_eset_norm_PC[palmieri_means_PC >=sh_cutoff,] # number of probes passed the shorth cutoff
dim(filtered.set)

## Features  Samples 
##    98548       18

no_of_samples_PC <- table(paste0(pData(palmieri_eset_norm_PC)$Cancer.status)) # number of samples
no_of_samples_PC

## 
## B T 
## 9 9

samples_cutoff_PC <- min(no_of_samples_PC) # select the group with minimum number of samples (9 in our case) 

idx_man_threshold_PC <- apply(Biobase::exprs(palmieri_eset_norm_PC), 1,function(x){sum(x > man_threshold_PC) >=samples_cutoff_PC})
table(idx_man_threshold_PC) # number of probes that passed the short cutoff and in at least 50% samples

## idx_man_threshold_PC
## FALSE  TRUE 
## 42733 96012

palmieri_manfiltered_PC <- subset(palmieri_eset_norm_PC, idx_man_threshold_PC) # filter out the probes do not meet the criteria

The distribution of the histograms above shows an enrichment of low means on the left-hand side, and these represent lowly expressed genes which we want to filter out. Q: is the cutoff using shorth ok?

Removing multiple mappings

Many transcript-cluster identifiers will map to multiple gene symbols, i.e. they can’t be unambigously assigned.

We first get rid of the unannotated transcript clusters

anno_palmieri_PC <- fData(palmieri_manfiltered_PC)
anno_palmieri_PC <- subset(anno_palmieri_PC, !is.na(SYMBOL))
dim(anno_palmieri_PC)

## [1] 64292     4

anno_palmieri_PC[1:6,1:4]

##                             PROBEID                        ID     SYMBOL
## TC0100006432.hg.1 TC0100006432.hg.1                 NR_046018    DDX11L1
## TC0100006433.hg.1 TC0100006433.hg.1 spopoybu.aAug10-unspliced   spopoybu
## TC0100006434.hg.1 TC0100006434.hg.1                 NR_036267 MIR1302-10
## TC0100006436.hg.1 TC0100006436.hg.1           ENST00000492842     OR4G2P
## TC0100006437.hg.1 TC0100006437.hg.1              NM_001005484      OR4F5
## TC0100006438.hg.1 TC0100006438.hg.1           ENST00000442987     CICP27
##                                                                                              GENENAME
## TC0100006432.hg.1                                 DEAD/H (Asp-Glu-Ala-Asp/His) box helicase 11 like 1
## TC0100006433.hg.1                                                    Transcript Identified by AceView
## TC0100006434.hg.1                                                                    microRNA 1302-10
## TC0100006436.hg.1                      olfactory receptor, family 4, subfamily G, member 2 pseudogene
## TC0100006437.hg.1                                 olfactory receptor, family 4, subfamily F, member 5
## TC0100006438.hg.1 capicua transcriptional repressor pseudogene 27 [Source:HGNC Symbol;Acc:HGNC:48835]

Then we check how many rows that have the same probeID but mapping to different genes; There is no probesetID/transcript-cluster identifier mapping to different genes when using the annotation package ‘pd.clariom.d.human’ (i.e., each probe maps to one gene but not necessary to be a distinct gene). However, on the other hand, it is surely the case when applying the annotation package ‘clariomdhumantranscriptcluster.db’ .

anno_grouped_PC <- group_by(anno_palmieri_PC, PROBEID) #group by the probesetIDs 
dim(anno_grouped_PC)

## [1] 64292     4

anno_summarized_PC <- 
  dplyr::summarize(anno_grouped_PC, no_of_matches = n_distinct(SYMBOL)) #list out all the distinct IDs and the corresponding frequency
head(anno_summarized_PC)

## # A tibble: 6 x 2
##   PROBEID           no_of_matches
##   <chr>                     <int>
## 1 TC0100006432.hg.1             1
## 2 TC0100006433.hg.1             1
## 3 TC0100006434.hg.1             1
## 4 TC0100006436.hg.1             1
## 5 TC0100006437.hg.1             1
## 6 TC0100006438.hg.1             1

anno_filtered_PC <- filter(anno_summarized_PC, no_of_matches > 1) # filter for the IDs that have mapping greater than 1
head(anno_filtered_PC)

## # A tibble: 0 x 2
## # … with 2 variables: PROBEID <chr>, no_of_matches <int>

probe_stats_PC <- anno_filtered_PC #With dim(probe_stats), we can see how many probes have been mapped to multiple genes.
nrow(probe_stats_PC)

## [1] 0

ids_to_exlude_PC <- (featureNames(palmieri_manfiltered_PC) %in% probe_stats_PC$PROBEID) # matching the probes that need to be removed
table(ids_to_exlude_PC)

## ids_to_exlude_PC
## FALSE 
## 96012

palmieri_final_PC <- subset(palmieri_manfiltered_PC, !ids_to_exlude_PC) #filter out the probes having multiple mappings; and creat the final ExpressionSet
validObject(palmieri_final_PC)

## [1] TRUE

head(anno_palmieri_PC)

##                             PROBEID                        ID     SYMBOL
## TC0100006432.hg.1 TC0100006432.hg.1                 NR_046018    DDX11L1
## TC0100006433.hg.1 TC0100006433.hg.1 spopoybu.aAug10-unspliced   spopoybu
## TC0100006434.hg.1 TC0100006434.hg.1                 NR_036267 MIR1302-10
## TC0100006436.hg.1 TC0100006436.hg.1           ENST00000492842     OR4G2P
## TC0100006437.hg.1 TC0100006437.hg.1              NM_001005484      OR4F5
## TC0100006438.hg.1 TC0100006438.hg.1           ENST00000442987     CICP27
##                                                                                              GENENAME
## TC0100006432.hg.1                                 DEAD/H (Asp-Glu-Ala-Asp/His) box helicase 11 like 1
## TC0100006433.hg.1                                                    Transcript Identified by AceView
## TC0100006434.hg.1                                                                    microRNA 1302-10
## TC0100006436.hg.1                      olfactory receptor, family 4, subfamily G, member 2 pseudogene
## TC0100006437.hg.1                                 olfactory receptor, family 4, subfamily F, member 5
## TC0100006438.hg.1 capicua transcriptional repressor pseudogene 27 [Source:HGNC Symbol;Acc:HGNC:48835]

fData(palmieri_final_PC)$PROBEID <- rownames(fData(palmieri_final_PC))
fData(palmieri_final_PC) <- left_join(fData(palmieri_final_PC), anno_palmieri_PC)
rownames(fData(palmieri_final_PC)) <- fData(palmieri_final_PC)$PROBEID 
validObject(palmieri_final_PC)

## [1] TRUE

length(which(!is.na(fData(palmieri_final_PC)$SYMBOL)))  # check for number of truly annotated probesets

## [1] 64292

boxplot(palmieri_final_PC)

As nrow(probe_stats_PC) gives 0,which means that each of the transcrip-cluster identifiers annotated maps to only one gene (N.B., it does not mean that we are safe for the case where we have multiple transcrip-cluster identifiers that mapping to a same gene; and we will deal with this later).

Removing genes without a specific gene SYMBOL

f_filbyNA_PC <- na.omit(fData(palmieri_final_PC)) #filter out the NA rows
ids_to_exlude_NA_PC <- (featureNames(palmieri_final_PC) %in% f_filbyNA_PC$PROBEID) #match the feature names from ExpressionSet to the annotated probesetIDs that are left; it is a logical object
palmieri_final_PC = subset(palmieri_final_PC, ids_to_exlude_NA_PC)  # subset the ExpressionSet for which includes the features that are only truly annotated

palmieri_final_PC

## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 64292 features, 18 samples 
##   element names: exprs 
## protocolData
##   rowNames: DC_UKB_LH_10A1_T_Sample_4_(Clariom_D_Human).CEL
##     DC_UKB_LH_10E5_B_Sample_3_(Clariom_D_Human).CEL ...
##     DC_UKB_LH_9A1_T_Sample_6_(Clariom_D_Human)_2.CEL (18 total)
##   varLabels: exprs dates
##   varMetadata: labelDescription channel
## phenoData
##   rowNames: DC_UKB_LH_10A1_T_Sample_4_(Clariom_D_Human).CEL
##     DC_UKB_LH_10E5_B_Sample_3_(Clariom_D_Human).CEL ...
##     DC_UKB_LH_9A1_T_Sample_6_(Clariom_D_Human)_2.CEL (18 total)
##   varLabels: index Sample.ID Cancer.status Sample.ID_for_dendrogram
##   varMetadata: labelDescription channel
## featureData
##   featureNames: TC0100006432.hg.1 TC0100006433.hg.1 ...
##     TSUnmapped00000826.hg.1 (64292 total)
##   fvarLabels: PROBEID ID SYMBOL GENENAME
##   fvarMetadata: labelDescription
## experimentData: use 'experimentData(object)'
## Annotation: pd.clariom.d.human

Now, we have the ExpressionSet palmieri_final_PC which contains only those truly annotated features/transcript-cluster identifiers/genes/probeset IDs/rows.

Removing genes identified by AceView

We filter out the genes/transcripts that are identified by the gene library AceView because nothing is known about these transcripts and some of them are only predicted so are not particularly useful.

df_Aceview = fData(palmieri_final_PC)
no_of_AceView = df_Aceview[grepl("AceView", df_Aceview$GENENAME),]
head(no_of_AceView)

##                             PROBEID                        ID   SYMBOL
## TC0100006433.hg.1 TC0100006433.hg.1 spopoybu.aAug10-unspliced spopoybu
## TC0100006443.hg.1 TC0100006443.hg.1  storubu.aAug10-unspliced  storubu
## TC0100006446.hg.1 TC0100006446.hg.1  boyrabu.aAug10-unspliced  boyrabu
## TC0100006462.hg.1 TC0100006462.hg.1            jerfobu.aAug10  jerfobu
## TC0100006477.hg.1 TC0100006477.hg.1 sarmeybo.aAug10-unspliced sarmeybo
## TC0100006478.hg.1 TC0100006478.hg.1  turorbu.aAug10-unspliced  turorbu
##                                           GENENAME
## TC0100006433.hg.1 Transcript Identified by AceView
## TC0100006443.hg.1 Transcript Identified by AceView
## TC0100006446.hg.1 Transcript Identified by AceView
## TC0100006462.hg.1 Transcript Identified by AceView
## TC0100006477.hg.1 Transcript Identified by AceView
## TC0100006478.hg.1 Transcript Identified by AceView

length(no_of_AceView$PROBEID)

## [1] 20849

f_filbyAceView_PC = df_Aceview[!grepl("AceView", df_Aceview$GENENAME),]
ids_to_match_AceView_PC <- (featureNames(palmieri_final_PC) %in% f_filbyAceView_PC$PROBEID)
palmieri_final_PC = subset(palmieri_final_PC, ids_to_match_AceView_PC)
palmieri_final_PC

## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 43443 features, 18 samples 
##   element names: exprs 
## protocolData
##   rowNames: DC_UKB_LH_10A1_T_Sample_4_(Clariom_D_Human).CEL
##     DC_UKB_LH_10E5_B_Sample_3_(Clariom_D_Human).CEL ...
##     DC_UKB_LH_9A1_T_Sample_6_(Clariom_D_Human)_2.CEL (18 total)
##   varLabels: exprs dates
##   varMetadata: labelDescription channel
## phenoData
##   rowNames: DC_UKB_LH_10A1_T_Sample_4_(Clariom_D_Human).CEL
##     DC_UKB_LH_10E5_B_Sample_3_(Clariom_D_Human).CEL ...
##     DC_UKB_LH_9A1_T_Sample_6_(Clariom_D_Human)_2.CEL (18 total)
##   varLabels: index Sample.ID Cancer.status Sample.ID_for_dendrogram
##   varMetadata: labelDescription channel
## featureData
##   featureNames: TC0100006432.hg.1 TC0100006434.hg.1 ...
##     TSUnmapped00000826.hg.1 (43443 total)
##   fvarLabels: PROBEID ID SYMBOL GENENAME
##   fvarMetadata: labelDescription
## experimentData: use 'experimentData(object)'
## Annotation: pd.clariom.d.human

Removing pseudogene

Next, we also filter out the genes identified as pseudogenes.

df_pseudo = fData(palmieri_final_PC)
no_of_pseudo = df_pseudo[grepl("pseudogene", df_pseudo$GENENAME),]
head(no_of_pseudo)

##                             PROBEID              ID   SYMBOL
## TC0100006436.hg.1 TC0100006436.hg.1 ENST00000492842   OR4G2P
## TC0100006438.hg.1 TC0100006438.hg.1 ENST00000442987   CICP27
## TC0100006450.hg.1 TC0100006450.hg.1 ENST00000437905  WBP1LP7
## TC0100006451.hg.1 TC0100006451.hg.1 ENST00000432723    CICP7
## TC0100006453.hg.1 TC0100006453.hg.1 ENST00000416931 MTND1P23
## TC0100006454.hg.1 TC0100006454.hg.1 ENST00000457540 MTND2P28
##                                                                                                                                  GENENAME
## TC0100006436.hg.1                                                          olfactory receptor, family 4, subfamily G, member 2 pseudogene
## TC0100006438.hg.1                                     capicua transcriptional repressor pseudogene 27 [Source:HGNC Symbol;Acc:HGNC:48835]
## TC0100006450.hg.1                                       WW domain binding protein 1-like pseudogene 7 [Source:HGNC Symbol;Acc:HGNC:43955]
## TC0100006451.hg.1                                      capicua transcriptional repressor pseudogene 7 [Source:HGNC Symbol;Acc:HGNC:37756]
## TC0100006453.hg.1 mitochondrially encoded NADH:ubiquinone oxidoreductase core subunit 1 pseudogene 23 [Source:HGNC Symbol;Acc:HGNC:42092]
## TC0100006454.hg.1 mitochondrially encoded NADH:ubiquinone oxidoreductase core subunit 2 pseudogene 28 [Source:HGNC Symbol;Acc:HGNC:42129]

length(no_of_pseudo$PROBEID)

## [1] 6959

f_filbypseudogene_PC = df_pseudo[!grepl("pseudogene", df_pseudo$GENENAME),]
ids_to_match_pseudogene_PC <- (featureNames(palmieri_final_PC) %in% f_filbypseudogene_PC$PROBEID)
palmieri_final_PC = subset(palmieri_final_PC, ids_to_match_pseudogene_PC)
palmieri_final_PC

## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 36484 features, 18 samples 
##   element names: exprs 
## protocolData
##   rowNames: DC_UKB_LH_10A1_T_Sample_4_(Clariom_D_Human).CEL
##     DC_UKB_LH_10E5_B_Sample_3_(Clariom_D_Human).CEL ...
##     DC_UKB_LH_9A1_T_Sample_6_(Clariom_D_Human)_2.CEL (18 total)
##   varLabels: exprs dates
##   varMetadata: labelDescription channel
## phenoData
##   rowNames: DC_UKB_LH_10A1_T_Sample_4_(Clariom_D_Human).CEL
##     DC_UKB_LH_10E5_B_Sample_3_(Clariom_D_Human).CEL ...
##     DC_UKB_LH_9A1_T_Sample_6_(Clariom_D_Human)_2.CEL (18 total)
##   varLabels: index Sample.ID Cancer.status Sample.ID_for_dendrogram
##   varMetadata: labelDescription channel
## featureData
##   featureNames: TC0100006432.hg.1 TC0100006434.hg.1 ...
##     TSUnmapped00000826.hg.1 (36484 total)
##   fvarLabels: PROBEID ID SYMBOL GENENAME
##   fvarMetadata: labelDescription
## experimentData: use 'experimentData(object)'
## Annotation: pd.clariom.d.human

Removing genes without a specific GENENAME

There are some probes having their GENENAME as ‘- - -’, we also remove them. ?

# remove '---' genenames
df_threebar = fData(palmieri_final_PC)
no_of_threebar = df_threebar[grepl("---", df_threebar$GENENAME),]
head(no_of_threebar)

##                             PROBEID                ID        SYMBOL GENENAME
## TC0100006440.hg.1 TC0100006440.hg.1 ENST00000496488.1  RP11-34P13.9      ---
## TC0100006444.hg.1 TC0100006444.hg.1 ENST00000412666.1  RP4-669L17.2      ---
## TC0100006452.hg.1 TC0100006452.hg.1 ENST00000423796.1  RP5-857K21.2      ---
## TC0100006455.hg.1 TC0100006455.hg.1        uc057avw.1    AC114498.1      ---
## TC0100006464.hg.1 TC0100006464.hg.1 ENST00000422528.1 RP11-206L10.4      ---
## TC0100006466.hg.1 TC0100006466.hg.1 ENST00000358533.2 RP11-206L10.9      ---

length(no_of_threebar$PROBEID)

## [1] 11880

f_filbythree_PC = df_threebar[!grepl("---", df_threebar$GENENAME),]
ids_to_match_threebar_PC <- (featureNames(palmieri_final_PC) %in% f_filbythree_PC$PROBEID)
palmieri_final_PC = subset(palmieri_final_PC, ids_to_match_threebar_PC)
palmieri_final_PC

## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 24604 features, 18 samples 
##   element names: exprs 
## protocolData
##   rowNames: DC_UKB_LH_10A1_T_Sample_4_(Clariom_D_Human).CEL
##     DC_UKB_LH_10E5_B_Sample_3_(Clariom_D_Human).CEL ...
##     DC_UKB_LH_9A1_T_Sample_6_(Clariom_D_Human)_2.CEL (18 total)
##   varLabels: exprs dates
##   varMetadata: labelDescription channel
## phenoData
##   rowNames: DC_UKB_LH_10A1_T_Sample_4_(Clariom_D_Human).CEL
##     DC_UKB_LH_10E5_B_Sample_3_(Clariom_D_Human).CEL ...
##     DC_UKB_LH_9A1_T_Sample_6_(Clariom_D_Human)_2.CEL (18 total)
##   varLabels: index Sample.ID Cancer.status Sample.ID_for_dendrogram
##   varMetadata: labelDescription channel
## featureData
##   featureNames: TC0100006432.hg.1 TC0100006434.hg.1 ...
##     TSUnmapped00000826.hg.1 (24604 total)
##   fvarLabels: PROBEID ID SYMBOL GENENAME
##   fvarMetadata: labelDescription
## experimentData: use 'experimentData(object)'
## Annotation: pd.clariom.d.human

Collapsing multiple transcript-cluster identifiers to one measure for the same genes

It is usually the case that we can have multiple probes/transcripts that are mapping to the same genes, for example

fData = fData(palmieri_final_PC)
n_occur <- data.frame(table(fData$SYMBOL))
n_occur<-n_occur[n_occur$Freq > 1,]
head(n_occur)

##          Var1 Freq
## 1   5_8S_rRNA    3
## 2     5S_rRNA   11
## 3         7SK    3
## 47  ABCA9-AS1    2
## 55      ABCC1    2
## 108       ABO    2

We will collapse the probes into one for the same gene based their (max) mean expression aross teh whole samples.We achieve this by using the following codes:

ids_PC = fData(palmieri_final_PC)[,c(1,3)]  ## only need the probesetIDs (1st column) and gene symbols (3rd column)
ids_PC = ids_PC[ids_PC[,2] != '',]  ## get rid of 'NA' gene symbols 86161
ID2gene = ids_PC
exprSet = exprs(palmieri_final_PC) 


{
  MAX = by(exprSet, ID2gene[,2], 
           function(x) rownames(x)[ which.max(rowMeans(x))])
  MAX = as.character(MAX)    
  exprSet = exprSet[rownames(exprSet) %in% MAX,]        
  #rownames( exprSet ) = ID2gene[ match( rownames( exprSet ), ID2gene[ , 1 ] ), 2 ] # rownames to gene symbol
}
dim(exprSet)

## [1] 22165    18

ids_fr_mulprocolpase_probes_PC <- (featureNames(palmieri_final_PC) %in% rownames(exprSet))
palmieri_final_PC = subset(palmieri_final_PC, ids_fr_mulprocolpase_probes_PC)
palmieri_final_PC

## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 22165 features, 18 samples 
##   element names: exprs 
## protocolData
##   rowNames: DC_UKB_LH_10A1_T_Sample_4_(Clariom_D_Human).CEL
##     DC_UKB_LH_10E5_B_Sample_3_(Clariom_D_Human).CEL ...
##     DC_UKB_LH_9A1_T_Sample_6_(Clariom_D_Human)_2.CEL (18 total)
##   varLabels: exprs dates
##   varMetadata: labelDescription channel
## phenoData
##   rowNames: DC_UKB_LH_10A1_T_Sample_4_(Clariom_D_Human).CEL
##     DC_UKB_LH_10E5_B_Sample_3_(Clariom_D_Human).CEL ...
##     DC_UKB_LH_9A1_T_Sample_6_(Clariom_D_Human)_2.CEL (18 total)
##   varLabels: index Sample.ID Cancer.status Sample.ID_for_dendrogram
##   varMetadata: labelDescription channel
## featureData
##   featureNames: TC0100006432.hg.1 TC0100006437.hg.1 ...
##     TSUnmapped00000826.hg.1 (22165 total)
##   fvarLabels: PROBEID ID SYMBOL GENENAME
##   fvarMetadata: labelDescription
## experimentData: use 'experimentData(object)'
## Annotation: pd.clariom.d.human

Finally, the ExpressionSet palmieri_final_PC ended up with 22165 genes/features.

Differential Gene Expression Analysis

Now, we perform the differential gene expression analysis based on the pre-filtered palmieri_final_PC ExpressionSet.

Biobase::pData(palmieri_final_PC)$Sample.ID_for_DGE = c('10','10','12','12','13','13','14','14','15_2','15_2','17','17','7','7','8','8','9','9')

individual_PC = as.character(Biobase::pData(palmieri_final_PC)$Sample.ID_for_DGE) #label the tissues with their corresponding patient numbers
tumor = Biobase::pData(palmieri_final_PC)$Cancer.status  #group_list
tumor = factor(tumor)
individual_PC = factor(individual_PC)
design_patasvar_PC <- model.matrix(~0+tumor+individual_PC) # design matrix
design_patasvar_PC <- design_patasvar_PC[ , c (2,1,3:10)] # re-arrange the columns
colnames(design_patasvar_PC)[1:2] <- c("T", "B")
rownames(design_patasvar_PC) <- individual_PC 
head(design_patasvar_PC)

##    T B individual_PC12 individual_PC13 individual_PC14 individual_PC15_2
## 10 1 0               0               0               0                 0
## 10 0 1               0               0               0                 0
## 12 1 0               1               0               0                 0
## 12 0 1               1               0               0                 0
## 13 1 0               0               1               0                 0
## 13 0 1               0               1               0                 0
##    individual_PC17 individual_PC7 individual_PC8 individual_PC9
## 10               0              0              0              0
## 10               0              0              0              0
## 12               0              0              0              0
## 12               0              0              0              0
## 13               0              0              0              0
## 13               0              0              0              0

contrast_matrix_PC <- makeContrasts( T-B, levels = design_patasvar_PC) #contrast matrix
contrast_matrix_PC

##                    Contrasts
## Levels              T - B
##   T                     1
##   B                    -1
##   individual_PC12       0
##   individual_PC13       0
##   individual_PC14       0
##   individual_PC15_2     0
##   individual_PC17       0
##   individual_PC7        0
##   individual_PC8        0
##   individual_PC9        0

palmieri_fit_PC <- eBayes(contrasts.fit(lmFit(palmieri_final_PC,
                                              design = design_patasvar_PC),
                                        contrast_matrix_PC))
 # results <- decideTests (palmieri_fit_PC)
 # summary (results)

table_PC <- topTable(palmieri_fit_PC, number = Inf, adjust.method = "BH")  #results
#head(table_PC)
table_PC[1:6,5:10]

##                      logFC  AveExpr        t      P.Value  adj.P.Val        B
## TC0X00007559.hg.1 2.132570 5.149454 9.058751 1.711672e-06 0.03793921 4.789762
## TC0600014215.hg.1 2.289623 4.568959 8.000315 5.797001e-06 0.05197291 3.882451
## TC0400007511.hg.1 2.649395 5.968684 7.841290 7.034456e-06 0.05197291 3.733359
## TC1100006671.hg.1 2.820101 4.477998 7.440521 1.160157e-05 0.06428721 3.341640
## TC1700012093.hg.1 1.783100 5.668067 7.186389 1.608970e-05 0.07132565 3.080948
## TC0700010910.hg.1 1.550022 6.122438 6.720457 2.991761e-05 0.08223906 2.577001

table(decideTests(palmieri_fit_PC, adjust.method = "BH", p.value = 0.1, lfc = log2(1.5))) # table of gene classes; '-1' = downregulated, '1' = upregulated, '0' = not

## 
##    -1     0     1 
##     2 22116    47

table_PC = na.omit(table_PC)    #！ This is the result we want
write.csv(table_PC,"limma_table_PC_results.csv",quote = F) # export results after DGE analysis for ALL genes

The adj.P.Val from the topTable results is the Benjamini-Hochberg adjusted p-value. It is calculated by setting the parameter adjust.method to BH in function toptable.

Histogram of raw P-values

As a diagnostic check, we also plot the raw p-value histogram: We expect a uniform distribution for the p-values that correspond to true null hypotheses, while a peak near zero shows an enrichment for low p-values corresponding to differentially expressed (DE) genes.

hist(table_PC$P.Value, col = brewer.pal(3, name = "Set2")[3],
     main = "Tumor vs Normal prostate tissuses", xlab = "p-values")

DGE analysis on a single gene

Here, we take gene TC0X00007559.hg.1 with gene symbol GJB1 as an example because it is ranked first from the DGE result table by having the lowest adjusted p-values.

tumor_for_single_gene = Biobase::pData(palmieri_final_PC)$Cancer.status
#tissue_CD <- tissue[disease == "CD"]
expr_PC <- Biobase::exprs(palmieri_final_PC)["TC0X00007559.hg.1", ]
data_PC <- as.data.frame(expr_PC)
colnames(data_PC)[1] <- "org_value"
data_PC <- mutate(data_PC, individual = individual_PC, tumor_for_single_gene) #'tumor_for_single_gene' = 'tumor'

data_PC$Cancer_type <- factor(data_PC$tumor_for_single_gene, levels = c("B", "T")) #factorise

ggplot(data = data_PC, aes(x = Cancer_type, y = org_value, 
                             group = individual, color = individual)) +
  geom_line() +
  ggtitle("Expression changes for the GJB1 gene")


# crat_coef_PC <- lmFit(palmieri_final_PC,
#                    design = design_patasvar_PC)$coefficients["TC0X00007559.hg.1",]
# crat_coef_PC
# 
# crat_fitted_PC <- design_patasvar_PC %*% crat_coef_PC
# rownames(crat_fitted_PC) <- names(crat_expr_PC)
# colnames(crat_fitted_PC) <- "fitted_value"
# 
# crat_fitted_PC
# 
# crat_data_PC$fitted_value <- crat_fitted_PC
# 
# ggplot(data = crat_data_PC, aes(x = Cancer_type, y = fitted_value, 
#                              group = individual, color = individual)) +
#   geom_line() +
#   ggtitle("Fitted expression changes for the GJB1 gene")
# 
# 
benign_PC <- na.exclude(data_PC$org_value[tumor_for_single_gene == "B"])
malignant_PC <- na.exclude(data_PC$org_value[tumor_for_single_gene == "T"])
res_t_PC <- t.test(benign_PC ,malignant_PC , paired = TRUE)
res_t_PC

## 
##  Paired t-test
## 
## data:  benign_PC and malignant_PC
## t = -8.9681, df = 8, p-value = 1.902e-05
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -2.680927 -1.584213
## sample estimates:
## mean of the differences 
##                -2.13257

SampleGroup = pData(palmieri_final_PC)$Cancer.status
boxplot(exprs(palmieri_final_PC)["TC0X00007559.hg.1",]~SampleGroup , main="TC0X00007559.hg.1:GJB1",ylab="log2-transformed intensity")

The paired t-test on the gene gives a low p-value close to 0 and thus can conclude that the GJB1 gene is differentially expressed between benign and malignant tumors. We can also confirm this by checking from the expression changes plots and the boxplot.

Volcano plot

We plot the volcano plot based on setting the cutoff of fold change to 1.5 and Benjamini-Hochberg corrected p-value to 5% and 10%, respectively.

DEG=table_PC
logFC_cutoff <- log2(1.5)  #LogFC Threshold
DEG$change = as.factor(ifelse(DEG$adj.P.Val < 0.05 & abs(DEG$logFC) > logFC_cutoff,
                              ifelse(DEG$logFC > logFC_cutoff ,'UP','DOWN'),'NOT')     #logFC>logFC_cutoff+adj.P<0.05->UP, if not then if abs(lofFC)>cutoff+adj.P<0.05->down, else->NOT;
)
this_tile <- paste0('Cutoff for logFC is ',round(logFC_cutoff,3),
                    '\nThe number of up gene is ',nrow(DEG[DEG$change =='UP',]) ,
                    '\nThe number of down gene is ',nrow(DEG[DEG$change =='DOWN',])
)
this_tile

## [1] "Cutoff for logFC is 0.585\nThe number of up gene is 1\nThe number of down gene is 0"

#head(DEG)
g = ggplot(data=DEG, aes(x=logFC, y=-log10(adj.P.Val), color=change)) + 
  geom_point(alpha=0.4, size=1.75) + 
  theme_set(theme_set(theme_bw(base_size=20)))+
  xlab("log2 fold change") + ylab("-log10 FDR corrected p-value") +
  ggtitle( this_tile  ) + theme(plot.title = element_text(size=15,hjust = 0.5))+
  scale_colour_manual(values = c('blue','black','red'))  ## corresponding to the levels(res$change)
print(g)


DEG=table_PC
logFC_cutoff <- log2(1.5)  #LogFC Threshold
DEG$change = as.factor(ifelse(DEG$adj.P.Val < 0.1 & abs(DEG$logFC) > logFC_cutoff,
                              ifelse(DEG$logFC > logFC_cutoff ,'UP','DOWN'),'NOT')    
#logFC>logFC_cutoff+adj.P<0.1->UP, if not then if abs(lofFC)>cutoff+adj.P<0.1->down, else->NOT;
)
this_tile <- paste0('Cutoff for logFC is ',round(logFC_cutoff,3),
                    '\nThe number of up gene is ',nrow(DEG[DEG$change =='UP',]) ,
                    '\nThe number of down gene is ',nrow(DEG[DEG$change =='DOWN',])
)
this_tile

## [1] "Cutoff for logFC is 0.585\nThe number of up gene is 47\nThe number of down gene is 2"

#head(DEG)
g = ggplot(data=DEG, aes(x=logFC, y=-log10(adj.P.Val), color=change)) + 
  geom_point(alpha=0.4, size=1.75) + 
  theme_set(theme_set(theme_bw(base_size=20)))+
  xlab("log2 fold change") + ylab("-log10 FDR corrected p-value") +
  ggtitle( this_tile  ) + theme(plot.title = element_text(size=15,hjust = 0.5))+
  scale_colour_manual(values = c('blue','black','red'))  ## corresponding to the levels(res$change)
print(g)

Heatmap

We plot a heatmap for the top 50 genes based on the topTable results from DGE analysis.

Top50 = topTable(palmieri_fit_PC,number=50,adjust="BH", genelist = fData(palmieri_final_PC)$SYMBOL)  #?palmieri_fit_PC$genes$SYMBOL???????用哪一个
head(Top50)

##                       ID    logFC  AveExpr        t      P.Value  adj.P.Val
## TC0X00007559.hg.1   GJB1 2.132570 5.149454 9.058751 1.711672e-06 0.03793921
## TC0600014215.hg.1   GMDS 2.289623 4.568959 8.000315 5.797001e-06 0.05197291
## TC0400007511.hg.1  DANCR 2.649395 5.968684 7.841290 7.034456e-06 0.05197291
## TC1100006671.hg.1 OR51F2 2.820101 4.477998 7.440521 1.160157e-05 0.06428721
## TC1700012093.hg.1   FASN 1.783100 5.668067 7.186389 1.608970e-05 0.07132565
## TC0700010910.hg.1  POLD2 1.550022 6.122438 6.720457 2.991761e-05 0.08223906
##                          B
## TC0X00007559.hg.1 4.789762
## TC0600014215.hg.1 3.882451
## TC0400007511.hg.1 3.733359
## TC1100006671.hg.1 3.341640
## TC1700012093.hg.1 3.080948
## TC0700010910.hg.1 2.577001

str(Top50)

## 'data.frame':    50 obs. of  7 variables:
##  $ ID       : chr  "GJB1" "GMDS" "DANCR" "OR51F2" ...
##  $ logFC    : num  2.13 2.29 2.65 2.82 1.78 ...
##  $ AveExpr  : num  5.15 4.57 5.97 4.48 5.67 ...
##  $ t        : num  9.06 8 7.84 7.44 7.19 ...
##  $ P.Value  : num  1.71e-06 5.80e-06 7.03e-06 1.16e-05 1.61e-05 ...
##  $ adj.P.Val: num  0.0379 0.052 0.052 0.0643 0.0713 ...
##  $ B        : num  4.79 3.88 3.73 3.34 3.08 ...

## visualize
X = exprs(palmieri_final_PC)
Xtop = X[rownames(Top50),]
rownames(Xtop) = Top50$ID
colnames(Xtop) <- c("10A1_T.CEL",   
                          "10E5_B.CEL",  
                          "12A1_T.CEL",   
                          "12E5_B.CEL",   
                          "13A1_T.CEL",  
                          "13E5_B.CEL",  
                          "14A1_T.CEL",  
                          "14E5_B.CEL",  
                          "15A1_T_2.CEL",
                          "15D4_B_2.CEL",
                          "17A1_B.CEL",  
                          "17B2_T.CEL",  
                          "7A1_B.CEL",    
                          "7K11_T.CEL",  
                          "8A1_B.CEL",    
                          "8G7_T.CEL",  
                          "904_B.CEL",    
                          "9A1_T.CEL")


heatmap(Xtop, Colv = NA, margins = c(7,2))

GO based Gene Set Enrichment Analysis

Transfer from gene symbol to EntrzID

gene_for_up_down_set_PC <- bitr(unique(DEG$SYMBOL), fromType = "SYMBOL",
                                      toType = c( "ENTREZID"),
                                      OrgDb = org.Hs.eg.db)             #24199 in total

## 'select()' returned 1:many mapping between keys and columns

## Warning in bitr(unique(DEG$SYMBOL), fromType = "SYMBOL", toType =
## c("ENTREZID"), : 10.01% of input gene IDs are fail to map...

head(gene_for_up_down_set_PC)

##   SYMBOL ENTREZID
## 1   GJB1     2705
## 2   GMDS     2762
## 3  DANCR    57291
## 4 OR51F2   119694
## 5   FASN     2194
## 6  POLD2     5425

dim(gene_for_up_down_set_PC)

## [1] 19949     2

head(DEG)

##                             PROBEID           ID SYMBOL
## TC0X00007559.hg.1 TC0X00007559.hg.1    NM_000166   GJB1
## TC0600014215.hg.1 TC0600014215.hg.1 NM_001253846   GMDS
## TC0400007511.hg.1 TC0400007511.hg.1    NR_024031  DANCR
## TC1100006671.hg.1 TC1100006671.hg.1 NM_001004753 OR51F2
## TC1700012093.hg.1 TC1700012093.hg.1    NM_004104   FASN
## TC0700010910.hg.1 TC0700010910.hg.1 NM_001127218  POLD2
##                                                                GENENAME
## TC0X00007559.hg.1                           gap junction protein beta 1
## TC0600014215.hg.1                           GDP-mannose 4,6-dehydratase
## TC0400007511.hg.1   differentiation antagonizing non-protein coding RNA
## TC1100006671.hg.1  olfactory receptor, family 51, subfamily F, member 2
## TC1700012093.hg.1                                   fatty acid synthase
## TC0700010910.hg.1 polymerase (DNA directed), delta 2, accessory subunit
##                      logFC  AveExpr        t      P.Value  adj.P.Val        B
## TC0X00007559.hg.1 2.132570 5.149454 9.058751 1.711672e-06 0.03793921 4.789762
## TC0600014215.hg.1 2.289623 4.568959 8.000315 5.797001e-06 0.05197291 3.882451
## TC0400007511.hg.1 2.649395 5.968684 7.841290 7.034456e-06 0.05197291 3.733359
## TC1100006671.hg.1 2.820101 4.477998 7.440521 1.160157e-05 0.06428721 3.341640
## TC1700012093.hg.1 1.783100 5.668067 7.186389 1.608970e-05 0.07132565 3.080948
## TC0700010910.hg.1 1.550022 6.122438 6.720457 2.991761e-05 0.08223906 2.577001
##                   change
## TC0X00007559.hg.1     UP
## TC0600014215.hg.1     UP
## TC0400007511.hg.1     UP
## TC1100006671.hg.1     UP
## TC1700012093.hg.1     UP
## TC0700010910.hg.1     UP

dim(DEG)

## [1] 22165    11

Merge gene symbol with entrezID

DEG_for_up_down_set_PC=merge(DEG,gene_for_up_down_set_PC,by.y='SYMBOL',by.x='SYMBOL') #merge symbol with entrezID
head(DEG_for_up_down_set_PC)

##      SYMBOL           PROBEID                ID
## 1  A1BG-AS1 TC1900009040.hg.1         NR_015380
## 2       A2M TC1200009847.hg.1         NM_000014
## 3   A2M-AS1 TC1200006741.hg.1         NR_026971
## 4     A2ML1 TC1200006736.hg.1      NM_001282424
## 5 A2ML1-AS1 TC1200009835.hg.1 ENST00000537288.1
## 6 A2ML1-AS2 TC1200009839.hg.1 ENST00000394240.3
##                                                    GENENAME       logFC
## 1                                      A1BG antisense RNA 1  0.36963760
## 2                                     alpha-2-macroglobulin -0.18015821
## 3                        A2M antisense RNA 1 (head to head) -0.14851336
## 4                              alpha-2-macroglobulin-like 1 -0.06491690
## 5 A2ML1 antisense RNA 1 [Source:HGNC Symbol;Acc:HGNC:41022] -0.37139844
## 6 A2ML1 antisense RNA 2 [Source:HGNC Symbol;Acc:HGNC:41523] -0.06509982
##    AveExpr          t   P.Value adj.P.Val         B change  ENTREZID
## 1 3.303458  1.4971312 0.1619753 0.4672892 -4.987661    NOT    503538
## 2 6.912405 -0.3465907 0.7353172 0.8861146 -5.970505    NOT         2
## 3 3.437194 -0.8063836 0.4367948 0.7061497 -5.707618    NOT    144571
## 4 2.406198 -0.4227723 0.6804573 0.8575844 -5.940807    NOT    144568
## 5 2.454145 -1.7828923 0.1016728 0.3874791 -4.604878    NOT 100874108
## 6 5.953749 -0.4441798 0.6653651 0.8493792 -5.931437    NOT 106478979

dim(DEG_for_up_down_set_PC)

## [1] 19949    12

Go GSEA analysis

Now we have our input gene set object DEG_for_up_down_set_PC and will perform the GSEA

#BiocManager::install("clusterProfiler")
library(clusterProfiler)
#BiocManager::install("enrichplot")
library(enrichplot)
# SET THE DESIRED ORGANISM HERE
organism_PC = "org.Hs.eg.db"
#BiocManager::install(organism_PC, character.only = TRUE)
library(organism_PC, character.only = TRUE)

### Prepare Input:
df_GO = DEG_for_up_down_set_PC
# we want the log2 fold change 
original_geneList_PC <- df_GO$logFC    #THIS IS THE ORIGINAL INPUT OF GENE LIST
# name the vector
names(original_geneList_PC) <- df_GO$SYMBOL
# omit any NA values 
geneList_GO_PC <-na.omit(original_geneList_PC)  #其实没有NA了，之前全都过滤掉了
# sort the list in decreasing order (required for clusterProfiler); final geneset list
geneList_GO_PC = sort(geneList_GO_PC, decreasing = TRUE)

###Gene Set Enrichment(GO):

gse_GO_PC<- gseGO(geneList=geneList_GO_PC, 
                     ont ="ALL", 
                     keyType = "SYMBOL", 
                     nPerm = 10000, #10000, 
                     minGSSize = 3, 
                     maxGSSize = 800, 
                     pvalueCutoff = 0.05, 
                     verbose = TRUE, 
                     OrgDb = org.Hs.eg.db,       
                     pAdjustMethod = "BH")

## Warning in .GSEA(geneList = geneList, exponent = exponent, minGSSize =
## minGSSize, : We do not recommend using nPerm parameter incurrent and future
## releases

## Warning in fgsea(pathways = geneSets, stats = geneList, nperm = nPerm, minSize
## = minGSSize, : You are trying to run fgseaSimple. It is recommended to use
## fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument
## in the fgsea function call.

## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (0.33% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.

## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize,
## gseaParam, : There are duplicate gene names, fgsea may produce unexpected
## results.

## Warning in fgseaSimple(...): There were 7 pathways for which P-values were not
## calculated properly due to unbalanced gene-level statistic values

head(gse_GO_PC)[,1:10]

##            ONTOLOGY         ID                           Description setSize
## GO:0000139       CC GO:0000139                        Golgi membrane     685
## GO:0000151       CC GO:0000151              ubiquitin ligase complex     253
## GO:0000209       BP GO:0000209            protein polyubiquitination     288
## GO:0000226       BP GO:0000226 microtubule cytoskeleton organization     483
## GO:0000228       CC GO:0000228                    nuclear chromosome     474
## GO:0000280       BP GO:0000280                      nuclear division     324
##            enrichmentScore      NES    pvalue    p.adjust     qvalues rank
## GO:0000139       0.4915928 1.447689 9.999e-05 0.002085725 0.001717074 6571
## GO:0000151       0.4969580 1.436450 9.999e-05 0.002085725 0.001717074 7335
## GO:0000209       0.5318977 1.541338 9.999e-05 0.002085725 0.001717074 5412
## GO:0000226       0.4536137 1.328601 9.999e-05 0.002085725 0.001717074 6165
## GO:0000228       0.5253868 1.538213 9.999e-05 0.002085725 0.001717074 6177
## GO:0000280       0.4695175 1.363480 9.999e-05 0.002085725 0.001717074 6430

The GSEA results are shown above in the object gse_GO_PC

###Output/visualization of the GSEA results:

Dotplot

require(DOSE)
dotplot(gse_GO_PC, showCategory=10, split=".sign") + facet_grid(.~.sign)

Encrichment Map:

emapplot(gse_GO_PC,showCategory = 10)

Category Netplot:

# categorySize can be either 'pvalue' or 'geneNum'
# cnetplot(gse_GO_Islet, categorySize="pvalue", foldChange=geneList_GO_Islet, showCategory = 3)
cnetplot(gse_GO_PC, categorySize="pvalue", foldChange=geneList_GO_PC,showCategory = 3)
cnetplot(gse_GO_PC, node_label = "category",showCategory = 3)

GSEA Plot:

# Use the `Gene Set` param for the index in the title, and as the value for geneSetId
# gseaplot(gse_GO_Islet, by = "all", title = gse_GO_Islet$Description[1], geneSetID = 1)

gseaplot(gse_GO_PC, by = "all", title = gse_GO_PC$Description[1], geneSetID = 1)

Prostate Cancer Data Analysis

Zhuofan Mou

10/6/2020

Setting up working directory

List of packages needed