###Loading Required Libraries
#General Bioconductor packages
library(Biobase)
library(oligoClasses)
#Annotation and data import packages
library(ArrayExpress)
library(hgu133plus2.db)
#Quality control and pre-processing packages
library(oligo)
library(arrayQualityMetrics)
#Analysis and statistics packages
library(limma)
library(topGO)
library(ReactomePA)
library(clusterProfiler)
#Plotting and color options packages
library(gplots)
library(ggplot2)
library(geneplotter)
library(RColorBrewer)
library(pheatmap)
#Formatting/documentation packages
library(dplyr)
library(tidyr)
#Helpers:
library(stringr)
library(matrixStats)
library(genefilter)
library(openxlsx)
library(devtools)
#===========================================
#~~ Downloading and Importing the Data Set
raw_data_dir="C:\\Users\\Hamda\\Desktop\\University\\2nd Year\\Fall Semester\\Genomics and Data Mining\\R Workplace\\EMTAB_Arterial_Disease"
#anno_AE <- getAE("E-GEOD-27034", path = raw_data_dir, type = "full")
#===========================================
# ~~ Data Set Format Explanation:
# The data Sets in ArrayExpress are stored according to MAGE-TAB: Micro Array Gene Expression Tabular.
# The MAGE-TAB consists of:
# 1. Investigation Description Format (IDF)
# This file contains all the important details about the experiment as title, description, submitter, and protocols.
# 2. Array Design Format (ADF)
# This file contains all the important data about the microarray chip used.
# 3. Sample and Data Relationship Format (SDRF)
# This file contains all the essential information on the samples: ex what group(s) they belong to
# 4. raw data files
# 5. processed data files
#===========================================
# ~~Importing the SDRF: Sample & Data Relationship File
# 1. Select SDRF location:
sdrf_location <- file.path(raw_data_dir, "E-GEOD-27034.sdrf.txt")
# 2. use read.delim function to read from the folder
SDRF <- read.delim(sdrf_location)
#===========================================
# ~~Introducing the Structure of ExpressionSet:
# The Expression set combines several different sources of information from the MAGE-TAB into single a convenient structure. From the ExpressionSet we can extract the important information as it can be subsetted, copied, etc.
# Name Row Column
# 1. AssayData: Probes Intensities Sample ID's
# 2. PhenoData: Sample IDs (CEL #) Description ex Disease-Healthy
# To check: SDRF@data or raw_data@phenoData@data
# 3. featureData: Probes Intensities Features of Chips
# To check: raw_data@featureData@data
# 4. Experiment Data : Flexible data to describe the Experiment.
# 5. Raw Data & Processed Data (Distinctively)
#===========================================
# ~~Extracting Sample annotation into SDRF Annotated Data Frame:
# The Rownames in SDRF@data must contain the Samples names.
# Extract the Samples names from SDRF$Array.Data.File and assign them to rownames(SDRF)
rownames(SDRF) <- SDRF$Array.Data.File
# Turn the SDRF table into an Annotated Data Frame
SDRF <- AnnotatedDataFrame(SDRF)
# Creating the Set raw_data which contains:
# Array data, phenoData & infromation of the chip Annotation package used
# Each CEL files contain the Probe Intensity Values & some metadata
raw_data <- oligo::read.celfiles(filenames = file.path(raw_data_dir,
SDRF$Array.Data.File),
verbose = FALSE, phenoData = SDRF)
## Reading in : C:\Users\Hamda\Desktop\University\2nd Year\Fall Semester\Genomics and Data Mining\R Workplace\EMTAB_Arterial_Disease/GSM902358.cel
## Reading in : C:\Users\Hamda\Desktop\University\2nd Year\Fall Semester\Genomics and Data Mining\R Workplace\EMTAB_Arterial_Disease/GSM902357.cel
## Reading in : C:\Users\Hamda\Desktop\University\2nd Year\Fall Semester\Genomics and Data Mining\R Workplace\EMTAB_Arterial_Disease/GSM902356.cel
## Reading in : C:\Users\Hamda\Desktop\University\2nd Year\Fall Semester\Genomics and Data Mining\R Workplace\EMTAB_Arterial_Disease/GSM902355.cel
## Reading in : C:\Users\Hamda\Desktop\University\2nd Year\Fall Semester\Genomics and Data Mining\R Workplace\EMTAB_Arterial_Disease/GSM902354.cel
## Reading in : C:\Users\Hamda\Desktop\University\2nd Year\Fall Semester\Genomics and Data Mining\R Workplace\EMTAB_Arterial_Disease/GSM902353.cel
## Reading in : C:\Users\Hamda\Desktop\University\2nd Year\Fall Semester\Genomics and Data Mining\R Workplace\EMTAB_Arterial_Disease/GSM902352.cel
## Reading in : C:\Users\Hamda\Desktop\University\2nd Year\Fall Semester\Genomics and Data Mining\R Workplace\EMTAB_Arterial_Disease/GSM902351.cel
## Reading in : C:\Users\Hamda\Desktop\University\2nd Year\Fall Semester\Genomics and Data Mining\R Workplace\EMTAB_Arterial_Disease/GSM902350.cel
## Reading in : C:\Users\Hamda\Desktop\University\2nd Year\Fall Semester\Genomics and Data Mining\R Workplace\EMTAB_Arterial_Disease/GSM902349.cel
## Reading in : C:\Users\Hamda\Desktop\University\2nd Year\Fall Semester\Genomics and Data Mining\R Workplace\EMTAB_Arterial_Disease/GSM902348.cel
## Reading in : C:\Users\Hamda\Desktop\University\2nd Year\Fall Semester\Genomics and Data Mining\R Workplace\EMTAB_Arterial_Disease/GSM902347.cel
## Reading in : C:\Users\Hamda\Desktop\University\2nd Year\Fall Semester\Genomics and Data Mining\R Workplace\EMTAB_Arterial_Disease/GSM902346.cel
## Reading in : C:\Users\Hamda\Desktop\University\2nd Year\Fall Semester\Genomics and Data Mining\R Workplace\EMTAB_Arterial_Disease/GSM902345.cel
## Reading in : C:\Users\Hamda\Desktop\University\2nd Year\Fall Semester\Genomics and Data Mining\R Workplace\EMTAB_Arterial_Disease/GSM902344.cel
## Reading in : C:\Users\Hamda\Desktop\University\2nd Year\Fall Semester\Genomics and Data Mining\R Workplace\EMTAB_Arterial_Disease/GSM902343.cel
## Reading in : C:\Users\Hamda\Desktop\University\2nd Year\Fall Semester\Genomics and Data Mining\R Workplace\EMTAB_Arterial_Disease/GSM902342.cel
## Reading in : C:\Users\Hamda\Desktop\University\2nd Year\Fall Semester\Genomics and Data Mining\R Workplace\EMTAB_Arterial_Disease/GSM902341.cel
## Reading in : C:\Users\Hamda\Desktop\University\2nd Year\Fall Semester\Genomics and Data Mining\R Workplace\EMTAB_Arterial_Disease/GSM902340.cel
## Reading in : C:\Users\Hamda\Desktop\University\2nd Year\Fall Semester\Genomics and Data Mining\R Workplace\EMTAB_Arterial_Disease/GSM902339.cel
## Reading in : C:\Users\Hamda\Desktop\University\2nd Year\Fall Semester\Genomics and Data Mining\R Workplace\EMTAB_Arterial_Disease/GSM902338.cel
## Reading in : C:\Users\Hamda\Desktop\University\2nd Year\Fall Semester\Genomics and Data Mining\R Workplace\EMTAB_Arterial_Disease/GSM902337.cel
## Reading in : C:\Users\Hamda\Desktop\University\2nd Year\Fall Semester\Genomics and Data Mining\R Workplace\EMTAB_Arterial_Disease/GSM902336.cel
## Reading in : C:\Users\Hamda\Desktop\University\2nd Year\Fall Semester\Genomics and Data Mining\R Workplace\EMTAB_Arterial_Disease/GSM902335.cel
## Reading in : C:\Users\Hamda\Desktop\University\2nd Year\Fall Semester\Genomics and Data Mining\R Workplace\EMTAB_Arterial_Disease/GSM902334.cel
## Reading in : C:\Users\Hamda\Desktop\University\2nd Year\Fall Semester\Genomics and Data Mining\R Workplace\EMTAB_Arterial_Disease/GSM902333.cel
## Reading in : C:\Users\Hamda\Desktop\University\2nd Year\Fall Semester\Genomics and Data Mining\R Workplace\EMTAB_Arterial_Disease/GSM902332.cel
## Reading in : C:\Users\Hamda\Desktop\University\2nd Year\Fall Semester\Genomics and Data Mining\R Workplace\EMTAB_Arterial_Disease/GSM902331.cel
## Reading in : C:\Users\Hamda\Desktop\University\2nd Year\Fall Semester\Genomics and Data Mining\R Workplace\EMTAB_Arterial_Disease/GSM902330.cel
## Reading in : C:\Users\Hamda\Desktop\University\2nd Year\Fall Semester\Genomics and Data Mining\R Workplace\EMTAB_Arterial_Disease/GSM902329.cel
## Reading in : C:\Users\Hamda\Desktop\University\2nd Year\Fall Semester\Genomics and Data Mining\R Workplace\EMTAB_Arterial_Disease/GSM902328.cel
## Reading in : C:\Users\Hamda\Desktop\University\2nd Year\Fall Semester\Genomics and Data Mining\R Workplace\EMTAB_Arterial_Disease/GSM902327.cel
## Reading in : C:\Users\Hamda\Desktop\University\2nd Year\Fall Semester\Genomics and Data Mining\R Workplace\EMTAB_Arterial_Disease/GSM902326.cel
## Reading in : C:\Users\Hamda\Desktop\University\2nd Year\Fall Semester\Genomics and Data Mining\R Workplace\EMTAB_Arterial_Disease/GSM902325.cel
## Reading in : C:\Users\Hamda\Desktop\University\2nd Year\Fall Semester\Genomics and Data Mining\R Workplace\EMTAB_Arterial_Disease/GSM902324.cel
## Reading in : C:\Users\Hamda\Desktop\University\2nd Year\Fall Semester\Genomics and Data Mining\R Workplace\EMTAB_Arterial_Disease/GSM902323.cel
## Reading in : C:\Users\Hamda\Desktop\University\2nd Year\Fall Semester\Genomics and Data Mining\R Workplace\EMTAB_Arterial_Disease/GSM902322.cel
## Warning in oligo::read.celfiles(filenames = file.path(raw_data_dir,
## SDRF$Array.Data.File), : 'channel' automatically added to varMetadata in
## phenoData.
stopifnot(validObject(raw_data))
# When viewing the raw_data we notice that some Colums are not required.
# To View the raw data pheno Data use:
# head(Biobase::pData(raw_data))
# Sub selection of the Columns: Individual - With-Without PAD
Biobase::pData(raw_data) <- Biobase::pData(raw_data)[,c("Source.Name",
"Characteristics.disease.",
"Comment..Sample_source_name.",
"Comment..Sample_title.")]
# Remember: Matrix[,c(columns wanted)]
#===========================================
#===========================================
# ~~ Step 1: Performing BG Correction, Normalization, and Summarization
# This will be done Using my own Normalization function, and the built in rma bg correction and summarization functions.
#a. Background Correction using built in rma function in package oligo
bg_corrected_data <- oligo::rma(raw_data, normalize=FALSE)
## Background correcting
## Calculating Expression
#b.1) Normalization using QN_Final self implemented in Project-1-RMA
source("QN_Final.R")
calData_QN_byMedian <- quantile_normalization(Biobase::exprs(bg_corrected_data))
#b.2) Normalization using built in rma function in package oligo
calData_by_rma <- oligo::rma(raw_data)
## Background correcting
## Normalizing
## Calculating Expression
#~~ Assessing the Difference between QN and the Built in Function in Oligo:
# Visually from the summaries we can tell that both functions result in extremely similar outputs:
summary(calData_QN_byMedian)
## GSM902358.cel GSM902357.cel GSM902356.cel GSM902355.cel
## Min. : 2.044 Min. : 2.044 Min. : 2.044 Min. : 2.044
## 1st Qu.: 3.688 1st Qu.: 3.688 1st Qu.: 3.688 1st Qu.: 3.688
## Median : 5.169 Median : 5.169 Median : 5.169 Median : 5.169
## Mean : 5.646 Mean : 5.646 Mean : 5.646 Mean : 5.646
## 3rd Qu.: 7.239 3rd Qu.: 7.239 3rd Qu.: 7.239 3rd Qu.: 7.239
## Max. :14.878 Max. :14.878 Max. :14.878 Max. :14.878
## GSM902354.cel GSM902353.cel GSM902352.cel GSM902351.cel
## Min. : 2.044 Min. : 2.044 Min. : 2.044 Min. : 2.044
## 1st Qu.: 3.688 1st Qu.: 3.688 1st Qu.: 3.688 1st Qu.: 3.688
## Median : 5.169 Median : 5.169 Median : 5.169 Median : 5.169
## Mean : 5.646 Mean : 5.646 Mean : 5.646 Mean : 5.646
## 3rd Qu.: 7.239 3rd Qu.: 7.239 3rd Qu.: 7.239 3rd Qu.: 7.239
## Max. :14.878 Max. :14.878 Max. :14.878 Max. :14.878
## GSM902350.cel GSM902349.cel GSM902348.cel GSM902347.cel
## Min. : 2.044 Min. : 2.044 Min. : 2.044 Min. : 2.044
## 1st Qu.: 3.688 1st Qu.: 3.688 1st Qu.: 3.688 1st Qu.: 3.690
## Median : 5.169 Median : 5.169 Median : 5.169 Median : 5.169
## Mean : 5.646 Mean : 5.646 Mean : 5.646 Mean : 5.646
## 3rd Qu.: 7.239 3rd Qu.: 7.239 3rd Qu.: 7.239 3rd Qu.: 7.239
## Max. :14.878 Max. :14.878 Max. :14.878 Max. :14.878
## GSM902346.cel GSM902345.cel GSM902344.cel GSM902343.cel
## Min. : 2.044 Min. : 2.044 Min. : 2.044 Min. : 2.044
## 1st Qu.: 3.688 1st Qu.: 3.688 1st Qu.: 3.688 1st Qu.: 3.688
## Median : 5.169 Median : 5.169 Median : 5.169 Median : 5.169
## Mean : 5.646 Mean : 5.646 Mean : 5.646 Mean : 5.646
## 3rd Qu.: 7.239 3rd Qu.: 7.239 3rd Qu.: 7.239 3rd Qu.: 7.239
## Max. :14.878 Max. :14.878 Max. :14.878 Max. :14.878
## GSM902342.cel GSM902341.cel GSM902340.cel GSM902339.cel
## Min. : 2.044 Min. : 2.044 Min. : 2.044 Min. : 2.044
## 1st Qu.: 3.688 1st Qu.: 3.688 1st Qu.: 3.688 1st Qu.: 3.688
## Median : 5.169 Median : 5.169 Median : 5.169 Median : 5.169
## Mean : 5.646 Mean : 5.646 Mean : 5.646 Mean : 5.646
## 3rd Qu.: 7.239 3rd Qu.: 7.239 3rd Qu.: 7.239 3rd Qu.: 7.239
## Max. :14.878 Max. :14.878 Max. :14.878 Max. :14.878
## GSM902338.cel GSM902337.cel GSM902336.cel GSM902335.cel
## Min. : 2.044 Min. : 2.044 Min. : 2.044 Min. : 2.044
## 1st Qu.: 3.688 1st Qu.: 3.688 1st Qu.: 3.688 1st Qu.: 3.688
## Median : 5.169 Median : 5.169 Median : 5.169 Median : 5.169
## Mean : 5.646 Mean : 5.646 Mean : 5.646 Mean : 5.646
## 3rd Qu.: 7.239 3rd Qu.: 7.239 3rd Qu.: 7.239 3rd Qu.: 7.239
## Max. :14.878 Max. :14.878 Max. :14.878 Max. :14.878
## GSM902334.cel GSM902333.cel GSM902332.cel GSM902331.cel
## Min. : 2.044 Min. : 2.044 Min. : 2.044 Min. : 2.044
## 1st Qu.: 3.688 1st Qu.: 3.688 1st Qu.: 3.688 1st Qu.: 3.688
## Median : 5.169 Median : 5.169 Median : 5.169 Median : 5.169
## Mean : 5.646 Mean : 5.646 Mean : 5.646 Mean : 5.646
## 3rd Qu.: 7.239 3rd Qu.: 7.239 3rd Qu.: 7.239 3rd Qu.: 7.239
## Max. :14.878 Max. :14.878 Max. :14.878 Max. :14.878
## GSM902330.cel GSM902329.cel GSM902328.cel GSM902327.cel
## Min. : 2.044 Min. : 2.044 Min. : 2.044 Min. : 2.044
## 1st Qu.: 3.688 1st Qu.: 3.688 1st Qu.: 3.688 1st Qu.: 3.688
## Median : 5.169 Median : 5.169 Median : 5.169 Median : 5.169
## Mean : 5.646 Mean : 5.646 Mean : 5.646 Mean : 5.646
## 3rd Qu.: 7.239 3rd Qu.: 7.239 3rd Qu.: 7.239 3rd Qu.: 7.239
## Max. :14.878 Max. :14.878 Max. :14.878 Max. :14.878
## GSM902326.cel GSM902325.cel GSM902324.cel GSM902323.cel
## Min. : 2.044 Min. : 2.044 Min. : 2.044 Min. : 2.044
## 1st Qu.: 3.688 1st Qu.: 3.688 1st Qu.: 3.688 1st Qu.: 3.688
## Median : 5.169 Median : 5.169 Median : 5.169 Median : 5.169
## Mean : 5.646 Mean : 5.646 Mean : 5.646 Mean : 5.646
## 3rd Qu.: 7.239 3rd Qu.: 7.239 3rd Qu.: 7.239 3rd Qu.: 7.239
## Max. :14.878 Max. :14.878 Max. :14.878 Max. :14.878
## GSM902322.cel
## Min. : 2.044
## 1st Qu.: 3.688
## Median : 5.169
## Mean : 5.646
## 3rd Qu.: 7.239
## Max. :14.878
summary(Biobase::exprs(calData_by_rma))
## GSM902358.cel GSM902357.cel GSM902356.cel GSM902355.cel
## Min. : 2.504 Min. : 2.405 Min. : 2.462 Min. : 2.432
## 1st Qu.: 3.961 1st Qu.: 3.866 1st Qu.: 3.865 1st Qu.: 3.840
## Median : 5.303 Median : 5.266 Median : 5.268 Median : 5.234
## Mean : 5.861 Mean : 5.878 Mean : 5.875 Mean : 5.870
## 3rd Qu.: 7.338 3rd Qu.: 7.459 3rd Qu.: 7.455 3rd Qu.: 7.473
## Max. :14.927 Max. :14.974 Max. :14.951 Max. :14.917
## GSM902354.cel GSM902353.cel GSM902352.cel GSM902351.cel
## Min. : 2.481 Min. : 2.398 Min. : 2.381 Min. : 2.478
## 1st Qu.: 3.834 1st Qu.: 3.849 1st Qu.: 3.878 1st Qu.: 3.870
## Median : 5.228 Median : 5.256 Median : 5.268 Median : 5.270
## Mean : 5.867 Mean : 5.869 Mean : 5.876 Mean : 5.870
## 3rd Qu.: 7.483 3rd Qu.: 7.461 3rd Qu.: 7.431 3rd Qu.: 7.443
## Max. :14.909 Max. :14.938 Max. :14.944 Max. :14.914
## GSM902350.cel GSM902349.cel GSM902348.cel GSM902347.cel
## Min. : 2.460 Min. : 2.448 Min. : 2.439 Min. : 2.413
## 1st Qu.: 3.880 1st Qu.: 3.932 1st Qu.: 3.909 1st Qu.: 3.910
## Median : 5.268 Median : 5.232 Median : 5.198 Median : 5.220
## Mean : 5.870 Mean : 5.772 Mean : 5.777 Mean : 5.780
## 3rd Qu.: 7.433 3rd Qu.: 7.148 3rd Qu.: 7.215 3rd Qu.: 7.200
## Max. :14.935 Max. :14.871 Max. :14.833 Max. :14.873
## GSM902346.cel GSM902345.cel GSM902344.cel GSM902343.cel
## Min. : 2.429 Min. : 2.456 Min. : 2.438 Min. : 2.450
## 1st Qu.: 3.936 1st Qu.: 3.935 1st Qu.: 3.941 1st Qu.: 3.924
## Median : 5.223 Median : 5.241 Median : 5.225 Median : 5.212
## Mean : 5.773 Mean : 5.773 Mean : 5.762 Mean : 5.771
## 3rd Qu.: 7.158 3rd Qu.: 7.182 3rd Qu.: 7.135 3rd Qu.: 7.172
## Max. :14.857 Max. :14.969 Max. :14.879 Max. :14.895
## GSM902342.cel GSM902341.cel GSM902340.cel GSM902339.cel
## Min. : 2.358 Min. : 2.393 Min. : 2.446 Min. : 2.455
## 1st Qu.: 3.912 1st Qu.: 3.976 1st Qu.: 3.877 1st Qu.: 3.877
## Median : 5.215 Median : 5.242 Median : 5.265 Median : 5.256
## Mean : 5.778 Mean : 5.739 Mean : 5.871 Mean : 5.865
## 3rd Qu.: 7.205 3rd Qu.: 7.042 3rd Qu.: 7.425 3rd Qu.: 7.402
## Max. :14.905 Max. :14.861 Max. :14.951 Max. :14.952
## GSM902338.cel GSM902337.cel GSM902336.cel GSM902335.cel
## Min. : 2.421 Min. : 2.456 Min. : 2.383 Min. : 2.436
## 1st Qu.: 3.866 1st Qu.: 3.877 1st Qu.: 3.871 1st Qu.: 3.878
## Median : 5.246 Median : 5.240 Median : 5.234 Median : 5.223
## Mean : 5.865 Mean : 5.862 Mean : 5.868 Mean : 5.860
## 3rd Qu.: 7.432 3rd Qu.: 7.393 3rd Qu.: 7.430 3rd Qu.: 7.373
## Max. :14.899 Max. :14.924 Max. :14.911 Max. :14.903
## GSM902334.cel GSM902333.cel GSM902332.cel GSM902331.cel
## Min. : 2.483 Min. : 2.440 Min. : 2.470 Min. : 2.435
## 1st Qu.: 3.919 1st Qu.: 3.867 1st Qu.: 3.898 1st Qu.: 3.899
## Median : 5.279 Median : 5.253 Median : 5.257 Median : 5.265
## Mean : 5.865 Mean : 5.865 Mean : 5.863 Mean : 5.855
## 3rd Qu.: 7.328 3rd Qu.: 7.414 3rd Qu.: 7.371 3rd Qu.: 7.347
## Max. :14.938 Max. :14.907 Max. :14.910 Max. :15.070
## GSM902330.cel GSM902329.cel GSM902328.cel GSM902327.cel
## Min. : 2.446 Min. : 2.387 Min. : 2.362 Min. : 2.448
## 1st Qu.: 3.914 1st Qu.: 3.921 1st Qu.: 3.910 1st Qu.: 3.919
## Median : 5.201 Median : 5.194 Median : 5.200 Median : 5.202
## Mean : 5.768 Mean : 5.765 Mean : 5.761 Mean : 5.760
## 3rd Qu.: 7.168 3rd Qu.: 7.141 3rd Qu.: 7.167 3rd Qu.: 7.154
## Max. :14.888 Max. :14.826 Max. :14.884 Max. :14.827
## GSM902326.cel GSM902325.cel GSM902324.cel GSM902323.cel
## Min. : 2.448 Min. : 2.446 Min. : 2.488 Min. : 2.452
## 1st Qu.: 3.927 1st Qu.: 3.903 1st Qu.: 3.914 1st Qu.: 3.926
## Median : 5.221 Median : 5.213 Median : 5.228 Median : 5.228
## Mean : 5.765 Mean : 5.769 Mean : 5.768 Mean : 5.761
## 3rd Qu.: 7.164 3rd Qu.: 7.199 3rd Qu.: 7.186 3rd Qu.: 7.150
## Max. :14.894 Max. :14.929 Max. :14.871 Max. :14.798
## GSM902322.cel
## Min. : 2.403
## 1st Qu.: 3.939
## Median : 5.211
## Mean : 5.750
## 3rd Qu.: 7.123
## Max. :14.576
# The variance between both functions output is also small: order 10^-2
sum((Biobase::exprs(calData_by_rma)-calData_QN_byMedian)^2)/(nrow(calData_QN_byMedian)*37)
## [1] 0.06357698
# This allows us to continue working with the calData_QN, but for the sake of not wasting memory and time to reassemble the Expression Set from the Matrix produced by calData_QN, we will use calData_by_rma
cal_data <- calData_by_rma
#===========================================
#===========================================
# ~~ Step 2: Setting Median To Baseline and Observing the Clusters in HeatMap:
# ~~ Get back to this later...
row_medians_assayData <-
Biobase::rowMedians(as.matrix(Biobase::exprs(cal_data)))
Biobase::exprs(cal_data) <- sweep(Biobase::exprs(cal_data), 1, row_medians_assayData)
#===========================================
#===========================================
# ~~ Checking the Difference between the Current Calibrated Data and the Proccessed Data
processed <- as.matrix(read.table("C:\\Users\\Hamda\\Desktop\\University\\2nd Year\\Fall Semester\\Genomics and Data Mining\\R Workplace\\EMTAB_Arterial_Disease\\GSM902358_sample_table.txt"))
rownames(processed) <- processed[,1]
processed <- processed [-1]
processed <- as.numeric(processed)
## Warning: NAs introduced by coercion
processed <- sort(processed)
pip_data <- Biobase::exprs(cal_data)[,1]
pip_data <- sort(pip_data)
variance <- sum(processed-pip_data)^2/54675
variance
## [1] 0.0004699299
# We can tell that the variance between the processed data and the cal_data (in 1 sample, and i assume that the variance is the same in all the other samples) is very small = order 10^-4
#===========================================
#===========================================
# ~~ Step 3: Filtering out the lower 20% Expressed Probes
row_medians <- rowMedians(as.matrix(Biobase::exprs(calData_by_rma)))
man_threshold <- sort(row_medians)[length(row_medians)*0.2]
{hist_res <- hist(row_medians, 100, col = "cornsilk", freq = FALSE,
main = "Histogram of the median intensities",
border = "antiquewhite4",
xlab = "Medians intensities")
abline(v = man_threshold, col = "coral4", lwd = 2)}
no_of_samples <-
table(paste0(pData(cal_data)$Characteristics.disease.))
no_of_samples
##
## normal peripheral arterial disease
## 18 19
samples_cutoff <- min(no_of_samples)
idx_man_threshold <- apply(Biobase::exprs(calData_by_rma), 1,
function(x){
sum(x > man_threshold) >= samples_cutoff})
table(idx_man_threshold)
## idx_man_threshold
## FALSE TRUE
## 10760 43915
cal_filtered <- subset(cal_data, idx_man_threshold)
#===========================================
#=========================================
# ~~ Checking if the final probe IDS still exist after Filtering
probes <- read.table("C:\\Users\\Hamda\\Desktop\\University\\2nd Year\\Fall Semester\\Genomics and Data Mining\\R Workplace\\Genomics\\bif425_fall20_hamdanieh_bilal\\MicroArray_Project\\final_probes.txt")
probes <- unlist(probes)
probes <- as.character(probes)
Present <- lapply(probes,function(x){
any(rownames(Biobase::exprs(cal_filtered))==x)
})
# Check how many are Present: Must be around 95
length(which(Present==TRUE))
## [1] 96
#=========================================
# ~~ Step 4 a1: Only probes with >1.5 fold change Underwent Wilcox Test
# Calculating the Fold Change:
normal <- Biobase::exprs(cal_filtered)[,1:18]
disease <- Biobase::exprs(cal_filtered)[,19:37]
fold_change_by_mean <- rowMeans(normal)-rowMeans(disease)
FCl <- fold_change_by_mean[which(fold_change_by_mean < -1.5)]
FCg <- fold_change_by_mean[which(fold_change_by_mean > 1.5)]
logFC <- c(FCl,FCg)
logFC
## 211506_s_at 213524_s_at
## -1.882412 -1.962624
#=========================================
# ~~ Step 4 a2:
# We Observe No change due to filtering, and the logFC is still not correct
# Suspected that the splitting of the groups in the paper is not correct:
# I split according to Control-Patient instead of Disease-Normal:
normal <- Biobase::exprs(cal_data)[,str_detect(Biobase::pData(raw_data)$Comment..Sample_source_name.,pattern="Controls")]
disease <- Biobase::exprs(cal_data)[,str_detect(Biobase::pData(raw_data)$Comment..Sample_source_name.,pattern="patients")]
fold_change_by_mean <- rowMeans(normal) - rowMeans(disease)
FCl <- fold_change_by_mean[which(fold_change_by_mean < -1.5)]
FCg <- fold_change_by_mean[which(fold_change_by_mean > 1.5)]
logFC <- c(FCl,FCg)
length(logFC)
## [1] 723
Present <- lapply(probes,function(x){
any(names(logFC)==x)
})
# Check how many are Present: Must be around 95
length(which(Present==TRUE))
## [1] 1
# It appears that they also did not do this...
#=========================================
# ~~ Step 4 a3: On the cal_data before filtering Using again Control as Normal vs Disease
normal <- Biobase::exprs(cal_data)[,1:18]
disease <- Biobase::exprs(cal_data)[,19:37]
FC <- rowMeans(disease) - rowMeans(normal)
# One by One Trail to Compare Results:
#For 219326_s_at
FC["219326_s_at"]
## 219326_s_at
## 0.6126175
# VS PAPER FC = 1.53
#For 205681_at
FC["205681_at"]
## 205681_at
## 0.7913014
# VS PAPER FC = 1.73
#Notice: All my values = log2(FC) => My FC != FC_real but it is a logFC
#The paper follows a cut off of FC > 1.5 => logFC > log(1.5, base=2)
logFC_downR <- FC[which(FC <= -log(1.5,base=2))]
logFC_upR <- FC[which(FC >= log(1.5,base=2))]
logFC <- c(logFC_downR,logFC_upR)
#Check if they are Present in the Final Probes Found by the Paper
Present <- lapply(probes,function(x){
any(names(logFC)==x)
})
# Check how many are Present: Must be exactly 95
length(which(Present==TRUE))
## [1] 95
# SUCCESS
# Converting to FC:
FC <- 2^abs(logFC)
# Filtering the Data According to FC:
rows_select <- lapply(rownames(Biobase::exprs(cal_filtered)),function(x){
any(names(logFC)==x)
})
rows_select <- unlist(rows_select)
calFC_filtered <- subset(cal_filtered, rows_select)
#Re-Check for Presence of all Probes:
Present <- lapply(probes,function(x){
any(rownames(Biobase::exprs(calFC_filtered))==x)
})
# Check how many are Present: Must be around 95
length(which(Present==TRUE))
## [1] 95
#=========================================
#=========================================
# ~~Step 4 b: Now that the data is filtered by FC, we apply Wilcox Test to filter by p-value<0.05
p_values <- apply(Biobase::exprs(calFC_filtered),1, function(x){
ret <- wilcox.test(x[1:18],x[19:37])$p.value
})
## Warning in wilcox.test.default(x[1:18], x[19:37]): cannot compute exact p-value
## with ties
## Warning in wilcox.test.default(x[1:18], x[19:37]): cannot compute exact p-value
## with ties
## Warning in wilcox.test.default(x[1:18], x[19:37]): cannot compute exact p-value
## with ties
## Warning in wilcox.test.default(x[1:18], x[19:37]): cannot compute exact p-value
## with ties
## Warning in wilcox.test.default(x[1:18], x[19:37]): cannot compute exact p-value
## with ties
## Warning in wilcox.test.default(x[1:18], x[19:37]): cannot compute exact p-value
## with ties
## Warning in wilcox.test.default(x[1:18], x[19:37]): cannot compute exact p-value
## with ties
## Warning in wilcox.test.default(x[1:18], x[19:37]): cannot compute exact p-value
## with ties
## Warning in wilcox.test.default(x[1:18], x[19:37]): cannot compute exact p-value
## with ties
## Warning in wilcox.test.default(x[1:18], x[19:37]): cannot compute exact p-value
## with ties
## Warning in wilcox.test.default(x[1:18], x[19:37]): cannot compute exact p-value
## with ties
## Warning in wilcox.test.default(x[1:18], x[19:37]): cannot compute exact p-value
## with ties
## Warning in wilcox.test.default(x[1:18], x[19:37]): cannot compute exact p-value
## with ties
## Warning in wilcox.test.default(x[1:18], x[19:37]): cannot compute exact p-value
## with ties
## Warning in wilcox.test.default(x[1:18], x[19:37]): cannot compute exact p-value
## with ties
## Warning in wilcox.test.default(x[1:18], x[19:37]): cannot compute exact p-value
## with ties
## Warning in wilcox.test.default(x[1:18], x[19:37]): cannot compute exact p-value
## with ties
## Warning in wilcox.test.default(x[1:18], x[19:37]): cannot compute exact p-value
## with ties
## Warning in wilcox.test.default(x[1:18], x[19:37]): cannot compute exact p-value
## with ties
## Warning in wilcox.test.default(x[1:18], x[19:37]): cannot compute exact p-value
## with ties
## Warning in wilcox.test.default(x[1:18], x[19:37]): cannot compute exact p-value
## with ties
## Warning in wilcox.test.default(x[1:18], x[19:37]): cannot compute exact p-value
## with ties
## Warning in wilcox.test.default(x[1:18], x[19:37]): cannot compute exact p-value
## with ties
## Warning in wilcox.test.default(x[1:18], x[19:37]): cannot compute exact p-value
## with ties
## Warning in wilcox.test.default(x[1:18], x[19:37]): cannot compute exact p-value
## with ties
## Warning in wilcox.test.default(x[1:18], x[19:37]): cannot compute exact p-value
## with ties
## Warning in wilcox.test.default(x[1:18], x[19:37]): cannot compute exact p-value
## with ties
## Warning in wilcox.test.default(x[1:18], x[19:37]): cannot compute exact p-value
## with ties
## Warning in wilcox.test.default(x[1:18], x[19:37]): cannot compute exact p-value
## with ties
# Adjusting Using FDR:
p_adj <- p.adjust(p_values, method="fdr")
# Extracting Significant P_Values:
p_signif <- subset(p_adj,p_adj <= 0.05)
# Checking Number of Significant p_values: Must be between 80-100
length(p_signif)
## [1] 97
#Check if the data still contains the Final Probes found by the Paper:
Present_pV <- lapply(probes,function(x){
any(names(p_signif)==x)
})
# Check how many are Present: Must be between 80-100
length(which(Present_pV==TRUE))
## [1] 94
# List of Correctly Identified Probes Like the Paper: True Positives, Significant both here and in the paper
tp <- probes[unlist(Present_pV)]
tp
## [1] "219326_s_at" "205681_at" "215440_s_at" "1554229_at" "202284_s_at"
## [6] "208791_at" "208792_s_at" "226702_at" "225557_at" "211919_s_at"
## [11] "208811_s_at" "204751_x_at" "226817_at" "201044_x_at" "209457_at"
## [16] "219872_at" "207674_at" "211307_s_at" "221345_at" "209189_at"
## [21] "213524_s_at" "207387_s_at" "208524_at" "211555_s_at" "214455_at"
## [26] "1555464_at" "211506_s_at" "220266_s_at" "208960_s_at" "217738_at"
## [31] "217739_s_at" "243296_at" "203574_at" "216015_s_at" "205660_at"
## [36] "224102_at" "201120_s_at" "210845_s_at" "211924_s_at" "204285_s_at"
## [41] "202014_at" "37028_at" "1554997_a_at" "200730_s_at" "1569599_at"
## [46] "222088_s_at" "215223_s_at" "205214_at" "210190_at" "1552542_s_at"
## [51] "235086_at" "221060_s_at" "206116_s_at" "241133_at" "239661_at"
## [56] "220467_at" "236921_at" "238807_at" "216198_at" "236307_at"
## [61] "236796_at" "244172_at" "227576_at" "244425_at" "238635_at"
## [66] "232330_at" "239545_at" "230653_at" "244876_at" "230983_at"
## [71] "1563674_at" "228623_at" "1562289_at" "206785_s_at" "224559_at"
## [76] "243736_at" "232478_at" "228623_at" "243310_at" "239673_at"
## [81] "240128_at" "1559054_a_at" "241838_at" "242239_at" "228390_at"
## [86] "213939_s_at" "244267_at" "236561_at" "236427_at" "1556543_at"
## [91] "228157_at" "236562_at" "240155_x_at" "1558486_at"
# List of Probes that are not Identified but shouldve been: False Negatives, Non-Sign here but is in the Paper
fn <- probes[!unlist(Present_pV)]
fn
## [1] "1564164_at" "238875_at"
# List of Falsley Identified Probes: False Positives, Signficant Here but not in the Paper
fp_indices <- lapply(names(p_signif),function(x){
any(tp==x)
})
fp_indices <- unlist(fp_indices)
fp_indices <- which(fp_indices==FALSE)
fp <- names(p_signif[fp_indices])
fp
## [1] "1569703_a_at" "214073_at" "220187_at" "244383_at"
#---------------------------------
# Summary: Values might be off - or + [1-2]
# True Positives Count:
length(tp)
## [1] 94
# False Positive Count:
length(fp)
## [1] 4
# False Negative Count:
length(fn)
## [1] 2
# True Negative Count:
nrow(Biobase::exprs(calFC_filtered))-(length(p_signif)+length(fn))
## [1] 293
#---------------------------------
# Filtering the Data by Significant P-Values:
rows_select <- lapply(rownames(Biobase::exprs(calFC_filtered)),function(x){
any(names(p_signif)==x)
})
rows_select <- unlist(rows_select)
#Checking if the number of probes is correct:
length(which(rows_select==TRUE))
## [1] 97
#Subsetting:
final_filtered <- subset(calFC_filtered, rows_select)
#=========================================
#=========================================
# ~~ Step 6: Annotation and Gene Mapping
# The Paper maps the probes using several Tools: AILUN, Biomart, and GATExplorer
# We chose to use AnnotationDbi and the respective array db hgu133plus2.db for more familiar usage:
# Method 1:
anno_palmieri <- AnnotationDbi::select(hgu133plus2.db,
keys = (featureNames(final_filtered)),
columns = c("SYMBOL", "GENENAME"),
keytype = "PROBEID")
## 'select()' returned 1:many mapping between keys and columns
# Removing NA Mapped Probes:
anno_data <- subset(anno_palmieri, !is.na(SYMBOL))
# WITHOUT REMOVING MULTIPLE MAPPING PROBES:
#Attach P-Values:
anno_data <- cbind(anno_data,p_signif[anno_data$PROBEID])
# Attach FC VAlues:
anno_data <- cbind(anno_data,FC[anno_data$PROBEID])
# Attach Log FC
anno_data <- cbind(anno_data, logFC[anno_data$PROBEID])
#Set Col names
colnames(anno_data) <- c("PROBEID", "SYMBOL", "GENENAME", "P-VALUE", "FC", "logFC")
# Down_regulated:
down_reg <-anno_data[order(anno_data$logFC),]
down_reg <-down_reg[1:23,]
down_reg
## PROBEID SYMBOL
## 87 238635_at TMEM267
## 73 228157_at ZNF207
## 102 244267_at SATB1
## 24 206785_s_at KLRC2
## 25 206785_s_at KLRC1
## 68 224559_at MALAT1
## 9 1563674_at FCRL2
## 83 236561_at TGFBR1
## 77 230983_at NIBAN3
## 78 232330_at COA1
## 74 228390_at RAB30
## 97 242239_at NSUN6
## 6 1558486_at ZNF493
## 94 240155_x_at ZNF493
## 88 238807_at GAPDHP62
## 89 238807_at ANKRD46
## 79 232478_at MIR181A2HG
## 84 236562_at ZNF439
## 44 213939_s_at RUFY3
## 21 205660_at OASL
## 59 219872_at GASK1B
## 53 215440_s_at BEX4
## 45 214073_at CTTN
## GENENAME P-VALUE
## 87 transmembrane protein 267 4.303151e-06
## 73 zinc finger protein 207 2.549691e-02
## 102 SATB homeobox 1 2.271611e-02
## 24 killer cell lectin like receptor C2 3.519607e-02
## 25 killer cell lectin like receptor C1 3.519607e-02
## 68 metastasis associated lung adenocarcinoma transcript 1 2.978387e-02
## 9 Fc receptor like 2 4.386966e-02
## 83 transforming growth factor beta receptor 1 3.095888e-02
## 77 niban apoptosis regulator 3 4.386966e-02
## 78 cytochrome c oxidase assembly factor 1 homolog 4.612963e-02
## 74 RAB30, member RAS oncogene family 3.624755e-02
## 97 NOP2/Sun RNA methyltransferase 6 8.504442e-03
## 6 zinc finger protein 493 8.504442e-03
## 94 zinc finger protein 493 9.094277e-03
## 88 glyceraldehyde 3 phosphate dehydrogenase pseudogene 62 4.328194e-02
## 89 ankyrin repeat domain 46 4.328194e-02
## 79 MIR181A2 host gene 1.594047e-02
## 84 zinc finger protein 439 1.497014e-02
## 44 RUN and FYVE domain containing 3 3.624755e-02
## 21 2'-5'-oligoadenylate synthetase like 3.519607e-02
## 59 golgi associated kinase 1B 7.533397e-04
## 53 brain expressed X-linked 4 1.465427e-03
## 45 cortactin 4.612963e-02
## FC logFC
## 87 2.017640 -1.0126687
## 73 2.008657 -1.0062312
## 102 1.817717 -0.8621272
## 24 1.686105 -0.7536946
## 25 1.686105 -0.7536946
## 68 1.680600 -0.7489760
## 9 1.671209 -0.7408920
## 83 1.663156 -0.7339235
## 77 1.625080 -0.7005103
## 78 1.584004 -0.6635757
## 74 1.568395 -0.6492892
## 97 1.556362 -0.6381777
## 6 1.534127 -0.6174184
## 94 1.528619 -0.6122285
## 88 1.525809 -0.6095745
## 89 1.525809 -0.6095745
## 79 1.520219 -0.6042796
## 84 1.509712 -0.5942735
## 44 1.503484 -0.5883094
## 21 1.503496 0.5883207
## 59 1.503950 0.5887565
## 53 1.503956 0.5887622
## 45 1.511617 0.5960924
# Up_regulated:
up_reg <-anno_data[order(anno_data$logFC),]
up_reg <-up_reg[23:82,]
up_reg
## PROBEID SYMBOL
## 45 214073_at CTTN
## 2 1554229_at CREBRF
## 18 204285_s_at PMAIP1
## 54 216015_s_at NLRP3
## 31 208811_s_at TMEM135
## 32 208811_s_at DNAJB6
## 58 219326_s_at B3GNT2
## 70 226702_at CMPK2
## 14 201120_s_at PGRMC1
## 4 1555464_at IFIH1
## 17 203574_at NFIL3
## 65 222088_s_at SLC2A14
## 66 222088_s_at SLC2A3
## 35 209457_at DUSP5
## 67 224102_at P2RY12
## 51 215223_s_at SOD2
## 52 215223_s_at SOD2-OT1
## 40 211555_s_at GUCY1B1
## 69 225557_at CSRNP1
## 23 206116_s_at TPM1
## 41 211919_s_at CXCR4
## 37 210845_s_at PLAUR
## 20 205214_at STK17B
## 26 207387_s_at GK
## 61 220266_s_at KLF4
## 16 202284_s_at CDKN1A
## 12 200730_s_at PTP4A1
## 60 220187_at STEAP4
## 36 210190_at STX11
## 28 208524_at GPR15
## 1 1552542_s_at TAGAP
## 19 204751_x_at DSC2
## 34 209189_at FOS
## 38 211307_s_at FCAR
## 10 1569599_at SAMSN1
## 22 205681_at BCL2A1
## 56 217738_at NAMPT
## 42 211924_s_at PLAUR
## 57 217739_s_at NAMPT
## 63 221060_s_at TLR4
## 30 208792_s_at CLU
## 29 208791_at CLU
## 106 37028_at PPP1R15A
## 46 214455_at H2BC4
## 47 214455_at H2BC8
## 48 214455_at H2BC7
## 49 214455_at H2BC6
## 50 214455_at H2BC10
## 33 208960_s_at KLF6
## 95 241133_at TRBV27
## 13 201044_x_at DUSP1
## 15 202014_at PPP1R15A
## 27 207674_at FCAR
## 71 226817_at DSC2
## 98 243296_at NAMPT
## 80 235086_at THBS1
## 64 221345_at FFAR2
## 3 1554997_a_at PTGS2
## 39 211506_s_at CXCL8
## 43 213524_s_at G0S2
## GENENAME P-VALUE
## 45 cortactin 0.046129634
## 2 CREB3 regulatory factor 0.001575417
## 18 phorbol-12-myristate-13-acetate-induced protein 1 0.002741258
## 54 NLR family pyrin domain containing 3 0.043869660
## 31 transmembrane protein 135 0.001575417
## 32 DnaJ heat shock protein family (Hsp40) member B6 0.001575417
## 58 UDP-GlcNAc:betaGal beta-1,3-N-acetylglucosaminyltransferase 2 0.030958877
## 70 cytidine/uridine monophosphate kinase 2 0.037645745
## 14 progesterone receptor membrane component 1 0.004844073
## 4 interferon induced with helicase C domain 1 0.025496913
## 17 nuclear factor, interleukin 3 regulated 0.004687073
## 65 solute carrier family 2 member 14 0.030958877
## 66 solute carrier family 2 member 3 0.030958877
## 35 dual specificity phosphatase 5 0.005549701
## 67 purinergic receptor P2Y12 0.018619131
## 51 superoxide dismutase 2 0.037645745
## 52 SOD2 overlapping transcript 1 0.037645745
## 40 guanylate cyclase 1 soluble subunit beta 1 0.008504442
## 69 cysteine and serine rich nuclear protein 1 0.029783866
## 23 tropomyosin 1 0.010002391
## 41 C-X-C motif chemokine receptor 4 0.040277643
## 37 plasminogen activator, urokinase receptor 0.001575417
## 20 serine/threonine kinase 17b 0.003428028
## 26 glycerol kinase 0.030958877
## 61 Kruppel like factor 4 0.033780487
## 16 cyclin dependent kinase inhibitor 1A 0.021242178
## 12 protein tyrosine phosphatase 4A1 0.028576381
## 60 STEAP4 metalloreductase 0.028576381
## 36 syntaxin 11 0.001465427
## 28 G protein-coupled receptor 15 0.021242178
## 1 T cell activation RhoGTPase activating protein 0.001113954
## 19 desmocollin 2 0.004302779
## 34 Fos proto-oncogene, AP-1 transcription factor subunit 0.006989584
## 38 Fc fragment of IgA receptor 0.008504442
## 10 SAM domain, SH3 domain and nuclear localization signals 1 0.003428028
## 22 BCL2 related protein A1 0.004844073
## 56 nicotinamide phosphoribosyltransferase 0.025496913
## 42 plasminogen activator, urokinase receptor 0.001465427
## 57 nicotinamide phosphoribosyltransferase 0.017000369
## 63 toll like receptor 4 0.021242178
## 30 clusterin 0.037645745
## 29 clusterin 0.024834073
## 106 protein phosphatase 1 regulatory subunit 15A 0.005549701
## 46 H2B clustered histone 4 0.044572107
## 47 H2B clustered histone 8 0.044572107
## 48 H2B clustered histone 7 0.044572107
## 49 H2B clustered histone 6 0.044572107
## 50 H2B clustered histone 10 0.044572107
## 33 Kruppel like factor 6 0.037645745
## 95 T cell receptor beta variable 27 0.048537679
## 13 dual specificity phosphatase 1 0.043869660
## 15 protein phosphatase 1 regulatory subunit 15A 0.008504442
## 27 Fc fragment of IgA receptor 0.008504442
## 71 desmocollin 2 0.003957592
## 98 nicotinamide phosphoribosyltransferase 0.005549701
## 80 thrombospondin 1 0.036247554
## 64 free fatty acid receptor 2 0.003957592
## 3 prostaglandin-endoperoxide synthase 2 0.022716107
## 39 C-X-C motif chemokine ligand 8 0.017000369
## 43 G0/G1 switch 2 0.021242178
## FC logFC
## 45 1.511617 0.5960924
## 2 1.511622 0.5960971
## 18 1.514455 0.5987991
## 54 1.517349 0.6015533
## 31 1.524202 0.6080539
## 32 1.524202 0.6080539
## 58 1.529031 0.6126175
## 70 1.538064 0.6211151
## 14 1.540413 0.6233174
## 4 1.546469 0.6289783
## 17 1.553001 0.6350588
## 65 1.556597 0.6383957
## 66 1.556597 0.6383957
## 35 1.560783 0.6422699
## 67 1.561141 0.6426006
## 51 1.573518 0.6539934
## 52 1.573518 0.6539934
## 40 1.575685 0.6559795
## 69 1.576315 0.6565561
## 23 1.579254 0.6592430
## 41 1.588171 0.6673666
## 37 1.600316 0.6783564
## 20 1.603603 0.6813174
## 26 1.606814 0.6842026
## 61 1.608409 0.6856340
## 16 1.616663 0.6930186
## 12 1.618465 0.6946263
## 60 1.641161 0.7147165
## 36 1.657180 0.7287303
## 28 1.658588 0.7299552
## 1 1.664672 0.7352382
## 19 1.684992 0.7527415
## 34 1.697467 0.7633832
## 38 1.705582 0.7702645
## 10 1.717568 0.7803672
## 22 1.730635 0.7913014
## 56 1.734534 0.7945477
## 42 1.762189 0.8173689
## 57 1.797565 0.8460442
## 63 1.825701 0.8684508
## 30 1.830398 0.8721577
## 29 1.842551 0.8817044
## 106 1.867023 0.9007397
## 46 1.876363 0.9079390
## 47 1.876363 0.9079390
## 48 1.876363 0.9079390
## 49 1.876363 0.9079390
## 50 1.876363 0.9079390
## 33 1.877675 0.9089475
## 95 1.911669 0.9348325
## 13 1.940114 0.9561412
## 15 1.957603 0.9690885
## 27 1.984944 0.9890985
## 71 1.991323 0.9937276
## 98 2.002575 1.0018562
## 80 2.111648 1.0783695
## 64 2.344557 1.2293156
## 3 2.770057 1.4699157
## 39 3.686911 1.8824124
## 43 3.897703 1.9626242
#----------------------------------
#----------------------------------
# ~Removing Multiple Mapping Probes:
anno_palmieri <- subset(anno_palmieri, !is.na(SYMBOL))
# Group by ID
anno_grouped <- group_by(anno_palmieri, PROBEID)
# Summarize data
anno_summarized <-
dplyr::summarize(anno_grouped, no_of_matches = n_distinct(SYMBOL))
## `summarise()` ungrouping output (override with `.groups` argument)
head(anno_summarized)
## # A tibble: 6 x 2
## PROBEID no_of_matches
## <chr> <int>
## 1 1552542_s_at 1
## 2 1554229_at 1
## 3 1554997_a_at 1
## 4 1555464_at 1
## 5 1558486_at 1
## 6 1563674_at 1
# Filter, only 1 gene mapping allowed
anno_filtered <- filter(anno_summarized, no_of_matches > 1)
head(anno_filtered)
## # A tibble: 6 x 2
## PROBEID no_of_matches
## <chr> <int>
## 1 206785_s_at 2
## 2 208811_s_at 2
## 3 214455_at 5
## 4 215223_s_at 2
## 5 222088_s_at 2
## 6 238807_at 2
# Get Ids to exclude:
probe_stats <- anno_filtered
ids_to_exclude <- probe_stats[,1]
ids_to_exclude <- unlist(ids_to_exclude)
# Filter them out
indices_to_remove <- lapply(ids_to_exclude,
function(x){
which(anno_palmieri$PROBEID==x) })
indices_to_remove <- unlist(indices_to_remove)
# Remove Indices
f_data <- anno_palmieri[-indices_to_remove,]
#Attach P-Values:
f_data <- cbind(f_data,p_signif[f_data$PROBEID])
# Attach FC VAlues:
f_data <- cbind(f_data,FC[f_data$PROBEID])
# Attach Log FC
f_data <- cbind(f_data, logFC[f_data$PROBEID])
#Set Col names
colnames(f_data) <- c("PROBEID", "SYMBOL", "GENENAME", "P-VALUE", "FC", "logFC")
# Down_regulated:
f_down_reg <-f_data[order(f_data$logFC),]
f_down_reg <-f_down_reg[1:18,]
f_down_reg <- f_down_reg[order(f_down_reg$FC),]
# Up_regulated:
f_up_reg <-f_data[order(f_data$logFC),]
f_up_reg <-f_up_reg[18:67,]
f_up_reg
## PROBEID SYMBOL
## 53 215440_s_at BEX4
## 45 214073_at CTTN
## 2 1554229_at CREBRF
## 18 204285_s_at PMAIP1
## 54 216015_s_at NLRP3
## 58 219326_s_at B3GNT2
## 70 226702_at CMPK2
## 14 201120_s_at PGRMC1
## 4 1555464_at IFIH1
## 17 203574_at NFIL3
## 35 209457_at DUSP5
## 67 224102_at P2RY12
## 40 211555_s_at GUCY1B1
## 69 225557_at CSRNP1
## 23 206116_s_at TPM1
## 41 211919_s_at CXCR4
## 37 210845_s_at PLAUR
## 20 205214_at STK17B
## 26 207387_s_at GK
## 61 220266_s_at KLF4
## 16 202284_s_at CDKN1A
## 12 200730_s_at PTP4A1
## 60 220187_at STEAP4
## 36 210190_at STX11
## 28 208524_at GPR15
## 1 1552542_s_at TAGAP
## 19 204751_x_at DSC2
## 34 209189_at FOS
## 38 211307_s_at FCAR
## 10 1569599_at SAMSN1
## 22 205681_at BCL2A1
## 56 217738_at NAMPT
## 42 211924_s_at PLAUR
## 57 217739_s_at NAMPT
## 63 221060_s_at TLR4
## 30 208792_s_at CLU
## 29 208791_at CLU
## 106 37028_at PPP1R15A
## 33 208960_s_at KLF6
## 95 241133_at TRBV27
## 13 201044_x_at DUSP1
## 15 202014_at PPP1R15A
## 27 207674_at FCAR
## 71 226817_at DSC2
## 98 243296_at NAMPT
## 80 235086_at THBS1
## 64 221345_at FFAR2
## 3 1554997_a_at PTGS2
## 39 211506_s_at CXCL8
## 43 213524_s_at G0S2
## GENENAME P-VALUE
## 53 brain expressed X-linked 4 0.001465427
## 45 cortactin 0.046129634
## 2 CREB3 regulatory factor 0.001575417
## 18 phorbol-12-myristate-13-acetate-induced protein 1 0.002741258
## 54 NLR family pyrin domain containing 3 0.043869660
## 58 UDP-GlcNAc:betaGal beta-1,3-N-acetylglucosaminyltransferase 2 0.030958877
## 70 cytidine/uridine monophosphate kinase 2 0.037645745
## 14 progesterone receptor membrane component 1 0.004844073
## 4 interferon induced with helicase C domain 1 0.025496913
## 17 nuclear factor, interleukin 3 regulated 0.004687073
## 35 dual specificity phosphatase 5 0.005549701
## 67 purinergic receptor P2Y12 0.018619131
## 40 guanylate cyclase 1 soluble subunit beta 1 0.008504442
## 69 cysteine and serine rich nuclear protein 1 0.029783866
## 23 tropomyosin 1 0.010002391
## 41 C-X-C motif chemokine receptor 4 0.040277643
## 37 plasminogen activator, urokinase receptor 0.001575417
## 20 serine/threonine kinase 17b 0.003428028
## 26 glycerol kinase 0.030958877
## 61 Kruppel like factor 4 0.033780487
## 16 cyclin dependent kinase inhibitor 1A 0.021242178
## 12 protein tyrosine phosphatase 4A1 0.028576381
## 60 STEAP4 metalloreductase 0.028576381
## 36 syntaxin 11 0.001465427
## 28 G protein-coupled receptor 15 0.021242178
## 1 T cell activation RhoGTPase activating protein 0.001113954
## 19 desmocollin 2 0.004302779
## 34 Fos proto-oncogene, AP-1 transcription factor subunit 0.006989584
## 38 Fc fragment of IgA receptor 0.008504442
## 10 SAM domain, SH3 domain and nuclear localization signals 1 0.003428028
## 22 BCL2 related protein A1 0.004844073
## 56 nicotinamide phosphoribosyltransferase 0.025496913
## 42 plasminogen activator, urokinase receptor 0.001465427
## 57 nicotinamide phosphoribosyltransferase 0.017000369
## 63 toll like receptor 4 0.021242178
## 30 clusterin 0.037645745
## 29 clusterin 0.024834073
## 106 protein phosphatase 1 regulatory subunit 15A 0.005549701
## 33 Kruppel like factor 6 0.037645745
## 95 T cell receptor beta variable 27 0.048537679
## 13 dual specificity phosphatase 1 0.043869660
## 15 protein phosphatase 1 regulatory subunit 15A 0.008504442
## 27 Fc fragment of IgA receptor 0.008504442
## 71 desmocollin 2 0.003957592
## 98 nicotinamide phosphoribosyltransferase 0.005549701
## 80 thrombospondin 1 0.036247554
## 64 free fatty acid receptor 2 0.003957592
## 3 prostaglandin-endoperoxide synthase 2 0.022716107
## 39 C-X-C motif chemokine ligand 8 0.017000369
## 43 G0/G1 switch 2 0.021242178
## FC logFC
## 53 1.503956 0.5887622
## 45 1.511617 0.5960924
## 2 1.511622 0.5960971
## 18 1.514455 0.5987991
## 54 1.517349 0.6015533
## 58 1.529031 0.6126175
## 70 1.538064 0.6211151
## 14 1.540413 0.6233174
## 4 1.546469 0.6289783
## 17 1.553001 0.6350588
## 35 1.560783 0.6422699
## 67 1.561141 0.6426006
## 40 1.575685 0.6559795
## 69 1.576315 0.6565561
## 23 1.579254 0.6592430
## 41 1.588171 0.6673666
## 37 1.600316 0.6783564
## 20 1.603603 0.6813174
## 26 1.606814 0.6842026
## 61 1.608409 0.6856340
## 16 1.616663 0.6930186
## 12 1.618465 0.6946263
## 60 1.641161 0.7147165
## 36 1.657180 0.7287303
## 28 1.658588 0.7299552
## 1 1.664672 0.7352382
## 19 1.684992 0.7527415
## 34 1.697467 0.7633832
## 38 1.705582 0.7702645
## 10 1.717568 0.7803672
## 22 1.730635 0.7913014
## 56 1.734534 0.7945477
## 42 1.762189 0.8173689
## 57 1.797565 0.8460442
## 63 1.825701 0.8684508
## 30 1.830398 0.8721577
## 29 1.842551 0.8817044
## 106 1.867023 0.9007397
## 33 1.877675 0.9089475
## 95 1.911669 0.9348325
## 13 1.940114 0.9561412
## 15 1.957603 0.9690885
## 27 1.984944 0.9890985
## 71 1.991323 0.9937276
## 98 2.002575 1.0018562
## 80 2.111648 1.0783695
## 64 2.344557 1.2293156
## 3 2.770057 1.4699157
## 39 3.686911 1.8824124
## 43 3.897703 1.9626242
#===================================
#===================================
# ~~ Reading the Paper's Data
upR_paper <- read.xlsx("C:\\Users\\Hamda\\Desktop\\University\\2nd Year\\Fall Semester\\Genomics and Data Mining\\R Workplace\\Genomics\\bif425_fall20_hamdanieh_bilal\\MicroArray_Project\\UpRegulated_in_Paper.xlsx")
#Set Col names
colnames(upR_paper) <- c("PROBEID", "GENENAME", "SYMBOL", "P-VALUE", "FC")
# Sort according to FC, High to Low
upR_paper <-upR_paper[order(upR_paper$FC),]
downR_paper <- read.xlsx("C:\\Users\\Hamda\\Desktop\\University\\2nd Year\\Fall Semester\\Genomics and Data Mining\\R Workplace\\Genomics\\bif425_fall20_hamdanieh_bilal\\MicroArray_Project\\DownRegulated_in_Paper.xlsx")
#Set Col names
colnames(downR_paper) <- c("PROBEID", "GENENAME", "SYMBOL", "P-VALUE", "FC")
downR_paper <- downR_paper[order(downR_paper$FC),]
#--------------------
# Down Regulated Differences:
# OUR DOWN REGULATED:
f_down_reg[,c(1,5)]
## PROBEID FC
## 44 213939_s_at 1.503484
## 21 205660_at 1.503496
## 59 219872_at 1.503950
## 53 215440_s_at 1.503956
## 84 236562_at 1.509712
## 79 232478_at 1.520219
## 94 240155_x_at 1.528619
## 6 1558486_at 1.534127
## 97 242239_at 1.556362
## 74 228390_at 1.568395
## 78 232330_at 1.584004
## 77 230983_at 1.625080
## 83 236561_at 1.663156
## 9 1563674_at 1.671209
## 68 224559_at 1.680600
## 102 244267_at 1.817717
## 73 228157_at 2.008657
## 87 238635_at 2.017640
# PAPERS DOWN REGULATED:
downR_paper[,c(1,5)]
## PROBEID FC
## 4 216198_at 1.50
## 15 244876_at 1.50
## 29 238875_at 1.50
## 33 213939_s_at 1.50
## 39 236562_at 1.51
## 23 232478_at 1.52
## 25 243310_at 1.52
## 40 240155_x_at 1.52
## 3 238807_at 1.53
## 13 1564164_at 1.53
## 41 1558486_at 1.53
## 30 241838_at 1.54
## 31 242239_at 1.55
## 1 220467_at 1.56
## 27 240128_at 1.57
## 32 228390_at 1.57
## 11 232330_at 1.58
## 22 243736_at 1.59
## 28 1559054_a_at 1.60
## 36 236427_at 1.61
## 16 230983_at 1.63
## 2 236921_at 1.64
## 12 239545_at 1.64
## 9 244425_at 1.65
## 19 1562289_at 1.66
## 35 236561_at 1.66
## 17 1563674_at 1.68
## 21 224559_at 1.68
## 37 1556543_at 1.68
## 20 206785_s_at 1.69
## 6 236796_at 1.70
## 14 230653_at 1.77
## 7 244172_at 1.78
## 34 244267_at 1.81
## 18 228623_at 1.82
## 24 228623_at 1.82
## 8 227576_at 1.83
## 26 239673_at 1.94
## 38 228157_at 2.01
## 10 238635_at 2.03
## 5 236307_at 2.11
# Get Down Regulated IDs:
dR_O <- f_down_reg[,1]
dR_P <- downR_paper[,1]
dRPresent <- lapply(dR_P, function(x){
any(dR_O==x)
})
dRPresent <- unlist(dRPresent)
# Number of Matches:
length(which(dRPresent==TRUE))
## [1] 15
# Number of DownRegulated that were Missed:
length(which(dRPresent==FALSE))
## [1] 26
# This is Definitely Due to the Annotation Step since the paper Merges between several databases, while we only do one.
# We prove this by comparing the FC Values:
matched <- subset(dR_P, dRPresent)
# Get The Indices of the Matched Probes:
indicesO <- apply(f_down_reg,1,function(x){
any(matched==x[1])
})
indicesP <- apply(downR_paper,1,function(x){
any(matched==x[1])
})
# Get our FC Values
fcO <- f_down_reg[,5]
fcO <- fcO[indicesO]
fcO <- round(fcO,digits=2)
# Get Papers FC Values
fcP <- downR_paper[,5]
fcP <- fcP[indicesP]
# Calculate the Difference between FCs
fcDiff <- fcO-fcP
# Table to Show the Summary:
table_diff <- cbind(matched,fcO,fcP,fcDiff)
colnames(table_diff) <- c("ProbeIDs-Mathced", "Our FC", "Paper's FC", "FC Difference")
table_diff <- as.data.frame(table_diff)
table_diff
## ProbeIDs-Mathced Our FC Paper's FC FC Difference
## 1 213939_s_at 1.5 1.5 0
## 2 236562_at 1.51 1.51 0
## 3 232478_at 1.52 1.52 0
## 4 240155_x_at 1.53 1.52 0.01
## 5 1558486_at 1.53 1.53 0
## 6 242239_at 1.56 1.55 0.01
## 7 228390_at 1.57 1.57 0
## 8 232330_at 1.58 1.58 0
## 9 230983_at 1.63 1.63 0
## 10 236561_at 1.66 1.66 0
## 11 1563674_at 1.67 1.68 -0.01
## 12 224559_at 1.68 1.68 0
## 13 244267_at 1.82 1.81 0.01
## 14 228157_at 2.01 2.01 0
## 15 238635_at 2.02 2.03 -0.00999999999999979
# We can see the FC Difference is nearly 0 => our Data so far matches the Paper's Data
# If we Correct for the Annotation, we will get identical Data.
# The Paper Maps the Probe IDs Manually through GEO and other Tools.
#--------------------
#--------------------
# Up Regulated Differences:
# OUR UP REGULATED:
f_up_reg[,c(1,5)]
## PROBEID FC
## 53 215440_s_at 1.503956
## 45 214073_at 1.511617
## 2 1554229_at 1.511622
## 18 204285_s_at 1.514455
## 54 216015_s_at 1.517349
## 58 219326_s_at 1.529031
## 70 226702_at 1.538064
## 14 201120_s_at 1.540413
## 4 1555464_at 1.546469
## 17 203574_at 1.553001
## 35 209457_at 1.560783
## 67 224102_at 1.561141
## 40 211555_s_at 1.575685
## 69 225557_at 1.576315
## 23 206116_s_at 1.579254
## 41 211919_s_at 1.588171
## 37 210845_s_at 1.600316
## 20 205214_at 1.603603
## 26 207387_s_at 1.606814
## 61 220266_s_at 1.608409
## 16 202284_s_at 1.616663
## 12 200730_s_at 1.618465
## 60 220187_at 1.641161
## 36 210190_at 1.657180
## 28 208524_at 1.658588
## 1 1552542_s_at 1.664672
## 19 204751_x_at 1.684992
## 34 209189_at 1.697467
## 38 211307_s_at 1.705582
## 10 1569599_at 1.717568
## 22 205681_at 1.730635
## 56 217738_at 1.734534
## 42 211924_s_at 1.762189
## 57 217739_s_at 1.797565
## 63 221060_s_at 1.825701
## 30 208792_s_at 1.830398
## 29 208791_at 1.842551
## 106 37028_at 1.867023
## 33 208960_s_at 1.877675
## 95 241133_at 1.911669
## 13 201044_x_at 1.940114
## 15 202014_at 1.957603
## 27 207674_at 1.984944
## 71 226817_at 1.991323
## 98 243296_at 2.002575
## 80 235086_at 2.111648
## 64 221345_at 2.344557
## 3 1554997_a_at 2.770057
## 39 211506_s_at 3.686911
## 43 213524_s_at 3.897703
# PAPERS UP REGULATD
upR_paper[,c(1,5)]
## PROBEID FC
## 2 215440_s_at 1.50
## 34 205660_at 1.50
## 3 1554229_at 1.51
## 15 219872_at 1.51
## 33 216015_s_at 1.52
## 39 204285_s_at 1.52
## 10 208811_s_at 1.53
## 7 226702_at 1.54
## 36 201120_s_at 1.54
## 25 1555464_at 1.55
## 32 203574_at 1.55
## 14 209457_at 1.56
## 35 224102_at 1.56
## 45 222088_s_at 1.56
## 8 225557_at 1.57
## 46 215223_s_at 1.57
## 52 206116_s_at 1.58
## 9 211919_s_at 1.59
## 23 211555_s_at 1.59
## 37 210845_s_at 1.60
## 47 205214_at 1.60
## 21 207387_s_at 1.61
## 27 220266_s_at 1.61
## 4 202284_s_at 1.62
## 43 200730_s_at 1.62
## 48 210190_at 1.65
## 22 208524_at 1.66
## 49 1552542_s_at 1.66
## 11 204751_x_at 1.68
## 19 209189_at 1.70
## 17 211307_s_at 1.72
## 44 1569599_at 1.72
## 1 205681_at 1.73
## 29 217738_at 1.73
## 38 211924_s_at 1.76
## 30 217739_s_at 1.80
## 51 221060_s_at 1.82
## 6 208792_s_at 1.83
## 5 208791_at 1.84
## 41 37028_at 1.87
## 24 214455_at 1.88
## 28 208960_s_at 1.88
## 53 241133_at 1.91
## 13 201044_x_at 1.94
## 40 202014_at 1.96
## 12 226817_at 1.99
## 16 207674_at 1.99
## 31 243296_at 2.00
## 50 235086_at 2.12
## 18 221345_at 2.34
## 42 1554997_a_at 2.77
## 26 211506_s_at 3.69
## 20 213524_s_at 3.90
# Get Up Regulated Probe IDs:
uR_O <- f_up_reg[,1]
uR_P <- upR_paper[,1]
uRPresent <- lapply(uR_O, function(x){
any(uR_P==x)
})
uRPresent <- unlist(uRPresent)
# Number of Matches:
length(which(uRPresent==TRUE))
## [1] 47
# Number of Upregulated that were Extra:
length(which(uRPresent==FALSE))
## [1] 3
# This is Definitely Due to the Annotation Step since the paper Merges between several databases, while we only do one.
# We prove this by comparing the FC Values:
matched_up <- subset(uR_O, uRPresent)
# Get The Indices of the Matched Probes:
indicesO_Up <- apply(f_up_reg,1,function(x){
any(matched_up==x[1])
})
indicesP_Up <- apply(upR_paper,1,function(x){
any(matched_up==x[1])
})
# Get our FC Values
fcO <- f_up_reg[,5]
fcO <- fcO[indicesO_Up]
fcO <- round(fcO,digits=2)
# Get Papers FC Values
fcP <- upR_paper[,5]
fcP <- fcP[indicesP_Up]
# Calculate the Difference between FCs
fcDiff <- fcO-fcP
# Table to Show the Summary:
table_diff_Up <- cbind(matched_up,fcO,fcP,fcDiff)
colnames(table_diff_Up) <- c("ProbeIDs-Mathced", "Our FC", "Paper's FC", "FC Difference")
table_diff_Up <- as.data.frame(table_diff_Up)
table_diff_Up
## ProbeIDs-Mathced Our FC Paper's FC FC Difference
## 1 215440_s_at 1.5 1.5 0
## 2 1554229_at 1.51 1.51 0
## 3 204285_s_at 1.51 1.52 -0.01
## 4 216015_s_at 1.52 1.52 0
## 5 226702_at 1.54 1.54 0
## 6 201120_s_at 1.54 1.54 0
## 7 1555464_at 1.55 1.55 0
## 8 203574_at 1.55 1.55 0
## 9 209457_at 1.56 1.56 0
## 10 224102_at 1.56 1.56 0
## 11 211555_s_at 1.58 1.57 0.01
## 12 225557_at 1.58 1.58 0
## 13 206116_s_at 1.58 1.59 -0.01
## 14 211919_s_at 1.59 1.59 0
## 15 210845_s_at 1.6 1.6 0
## 16 205214_at 1.6 1.6 0
## 17 207387_s_at 1.61 1.61 0
## 18 220266_s_at 1.61 1.61 0
## 19 202284_s_at 1.62 1.62 0
## 20 200730_s_at 1.62 1.62 0
## 21 210190_at 1.66 1.65 0.01
## 22 208524_at 1.66 1.66 0
## 23 1552542_s_at 1.66 1.66 0
## 24 204751_x_at 1.68 1.68 0
## 25 209189_at 1.7 1.7 0
## 26 211307_s_at 1.71 1.72 -0.01
## 27 1569599_at 1.72 1.72 0
## 28 205681_at 1.73 1.73 0
## 29 217738_at 1.73 1.73 0
## 30 211924_s_at 1.76 1.76 0
## 31 217739_s_at 1.8 1.8 0
## 32 221060_s_at 1.83 1.82 0.01
## 33 208792_s_at 1.83 1.83 0
## 34 208791_at 1.84 1.84 0
## 35 37028_at 1.87 1.87 0
## 36 208960_s_at 1.88 1.88 0
## 37 241133_at 1.91 1.91 0
## 38 201044_x_at 1.94 1.94 0
## 39 202014_at 1.96 1.96 0
## 40 207674_at 1.98 1.99 -0.01
## 41 226817_at 1.99 1.99 0
## 42 243296_at 2 2 0
## 43 235086_at 2.11 2.12 -0.0100000000000002
## 44 221345_at 2.34 2.34 0
## 45 1554997_a_at 2.77 2.77 0
## 46 211506_s_at 3.69 3.69 0
## 47 213524_s_at 3.9 3.9 0
# AGAIN!
# We can see the FC Difference is nearly 0 => our Data so far matches the Paper's Data
# If we Correct for the Annotation, we will get identical Data.
# The Paper Maps the Probe IDs Manually through GEO and other Tools.
#-------------------
#=================================
# ~~ Step 7: Finding GO terms associated with differentially expressed genes
# For up-regulated Genes:
DE_genes <- f_up_reg[,1]
back_genes_idx <- genefilter::genefinder(final_filtered,
as.character(DE_genes),
method = "manhattan", scale = "none")
back_genes_idx <- sapply(back_genes_idx, function(x)x$indices)
back_genes <- featureNames(final_filtered)[back_genes_idx]
back_genes <- setdiff(back_genes, DE_genes)
length(back_genes)
## [1] 8
# # Running TOPGO ANALYSIS: (AS THE PIPELINE STUDIED IN THE COURSE)
# gene_IDs <- rownames(Biobase::exprs(cal_filtered))
# in_universe <- gene_IDs %in% c(DE_genes, back_genes)
# in_selection <- gene_IDs %in% DE_genes
#
# all_genes <- in_selection[in_universe]
# all_genes <- factor(as.integer(in_selection[in_universe]))
# names(all_genes) <- gene_IDs[in_universe]
#
# top_GO_data <- new("topGOdata", ontology = "BP", allGenes = all_genes,
# nodeSize = 10, annot = annFUN.db, affyLib = "hgu133plus2.db")
# OUTPUT: Building most specific GOs .....
# ERROR: SQL statements must be issued with dbExecute() or dbSendStatement() instead of dbGetQuery() or dbSendQuery().Error: unable to open database file
# Tried Fixes: Unloading & re-loading
# Reinstalling Package with dependencies
# Checked internet Connection
# Installed package as zip file into directory
# ================ none worked
# result_top_GO_elim <-
# runTest(top_GO_data, algorithm = "elim", statistic = "Fisher")
# result_top_GO_classic <-
# runTest(top_GO_data, algorithm = "classic", statistic = "Fisher")
#
# res_top_GO <- GenTable(top_GO_data, Fisher.elim = result_top_GO_elim,
# Fisher.classic = result_top_GO_classic,
# orderBy = "Fisher.elim" , topNodes = 100)
#
# genes_top_GO <- printGenes(top_GO_data, whichTerms = res_top_GO$GO.ID,
# chip = "hgu133plus2.db", geneCutOff = 1000)
#
# res_top_GO$sig_genes <- sapply(genes_top_GO, function(x){
# str_c(paste0(x[x$'raw p-value' == 2, "Symbol.id"],";"),
# collapse = "")
# })
# #=================================
# # FOR DOWNREGULATED GENES:
# DE_genes <- f_down_reg[,1]
# back_genes_idx <- genefilter::genefinder(final_filtered,
# as.character(DE_genes),
# method = "manhattan", scale = "none")
# back_genes_idx <- sapply(back_genes_idx, function(x)x$indices)
#
# back_genes <- featureNames(final_filtered)[back_genes_idx]
# back_genes <- setdiff(back_genes, DE_genes)
#
# length(back_genes)
#
# # Running TOPGO ANALYSIS: (AS THE PIPELINE STUDIED IN THE COURSE)
# gene_IDs <- rownames(Biobase::exprs(cal_filtered))
# in_universe <- gene_IDs %in% c(DE_genes, back_genes)
# in_selection <- gene_IDs %in% DE_genes
#
# all_genes <- in_selection[in_universe]
# all_genes <- factor(as.integer(in_selection[in_universe]))
# names(all_genes) <- gene_IDs[in_universe]
#
# top_GO_data <- new("topGOdata", ontology = "BP", allGenes = all_genes,
# nodeSize = 10, annot = annFUN.db, affyLib = "hgu133plus2.db")
# result_top_GO_elim <-
# runTest(top_GO_data, algorithm = "elim", statistic = "Fisher")
# result_top_GO_classic <-
# runTest(top_GO_data, algorithm = "classic", statistic = "Fisher")
#
# res_top_GO <- GenTable(top_GO_data, Fisher.elim = result_top_GO_elim,
# Fisher.classic = result_top_GO_classic,
# orderBy = "Fisher.elim" , topNodes = 100)
#
# genes_top_GO <- printGenes(top_GO_data, whichTerms = res_top_GO$GO.ID,
# chip = "hgu133plus2.db", geneCutOff = 1000)
#
# res_top_GO$sig_genes <- sapply(genes_top_GO, function(x){
# str_c(paste0(x[x$'raw p-value' == 2, "Symbol.id"],";"),
# collapse = "")
# })