library(GEOquery)
## Loading required package: Biobase
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unsplit, which.max, which.min
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## Setting options('download.file.method.GEOquery'='auto')
## Setting options('GEOquery.inmemory.gpl'=FALSE)
library(affy)
library(limma)
## 
## Attaching package: 'limma'
## The following object is masked from 'package:BiocGenerics':
## 
##     plotMA
library(sva)
## Loading required package: mgcv
## Loading required package: nlme
## This is mgcv 1.9-1. For overview type 'help("mgcv-package")'.
## Loading required package: genefilter
## Loading required package: BiocParallel
library(WGCNA)
## Loading required package: dynamicTreeCut
## Loading required package: fastcluster
## 
## Attaching package: 'fastcluster'
## The following object is masked from 'package:stats':
## 
##     hclust
## 
## 
## Attaching package: 'WGCNA'
## The following object is masked from 'package:stats':
## 
##     cor
library(ensembldb)
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:utils':
## 
##     findMatches
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## The following object is masked from 'package:nlme':
## 
##     collapse
## Loading required package: GenomeInfoDb
## Loading required package: GenomicFeatures
## Loading required package: AnnotationDbi
## Loading required package: AnnotationFilter
## 
## Attaching package: 'ensembldb'
## The following object is masked from 'package:stats':
## 
##     filter
library(biomaRt)
# Load a Normalized_expression_matrix_from_Wistar-STZ and SHR csv files
data_GSE4745 <- read.csv("/Users/lorandacalderonzamora/Downloads/CRdata4745.csv", row.names = 1)
data_GSE194325 <- read.csv("/Users/lorandacalderonzamora/Downloads/CRdata194325.csv", row.names = 1)
# Preprocessing sample identifiers and group assignment
colnames(data_GSE4745) <- gsub("\\..*", "", colnames(data_GSE4745)) 
group <- factor(colnames(data_GSE4745))

colnames(data_GSE194325) <- gsub("\\..*", "", colnames(data_GSE194325)) 
group <- factor(colnames(data_GSE194325))
# Merging Wistar-STZ Expression matrices by common genes
common_genes <- Reduce(intersect, list(
  rownames(data_GSE4745),
  rownames(data_GSE194325)))

data_GSE4745_common <- data_GSE4745[common_genes, ]
data_GSE194325_common <- data_GSE194325[common_genes, ]

unificated_expr_matrix <- cbind(data_GSE4745_common, 
                               data_GSE194325_common)

unificated_expr_matrix <- as.data.frame(unificated_expr_matrix)
# Relabeling sample identifiers with group labels for Differential Expression analysis
sample_ids <- colnames(unificated_expr_matrix)

group_labels <- sample_ids
group_labels[group_labels %in% c("GSM5833511", 
                                 "GSM5833512", 
                                 "GSM5833513",
                                 "GSM107416", 
                                 "GSM107417", 
                                 "GSM107418",
                                 "GSM107419", 
                                 "GSM107424", 
                                 "GSM107425", 
                                 "GSM107426", 
                                 "GSM107427", 
                                 "GSM107432", 
                                 "GSM107433", 
                                 "GSM107434", 
                                 "GSM107435")] <- "CTL"

group_labels[group_labels %in% c("GSM5833508", 
                                 "GSM5833509", 
                                 "GSM5833510",
                                 "GSM107412", 
                                 "GSM107413", 
                                 "GSM107414",
                                 "GSM107415", 
                                 "GSM107420", 
                                 "GSM107421", 
                                 "GSM107422", 
                                 "GSM107423", 
                                 "GSM107428", 
                                 "GSM107429", 
                                 "GSM107430",
                                 "GSM107431")] <- "Disease"

colnames(unificated_expr_matrix) <- group_labels
# Boxplot of the merged Expression matrix from unificated_expr_matrix prior to Batch correction
boxplot(unificated_expr_matrix, 
        main = "Gene Expression distribution before Batch correction", 
        col = c("red", "green")[as.numeric(factor(group_labels))], 
        las = 2, ylab = "Normalized Expression (log2 intensity)")
legend("topright", legend = levels(factor(group_labels)), fill = c("red", "green"))

# Quantile normalization of merged Expression matrix
library(preprocessCore)
expr_matrix_qn <- normalize.quantiles(as.matrix(unificated_expr_matrix))
rownames(expr_matrix_qn) <- rownames(unificated_expr_matrix)
colnames(expr_matrix_qn) <- colnames(unificated_expr_matrix)
# Batch effect correction between unificated_expr_matrix Datasets using ComBat
n_4745 <- ncol(data_GSE4745_common)

n_194325 <- ncol(data_GSE194325_common)
batch <- c(rep("GSE4745", n_4745), rep("GSE194325", n_194325))

combat_expr <- ComBat(dat = expr_matrix_qn, batch = batch, par.prior = TRUE, prior.plots = FALSE)
## Found 4 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found2batches
## Adjusting for0covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data
# Boxplot of the merged Expression matrix from Wistar-STZ and SHR after to Batch correction
boxplot(expr_matrix_qn, 
        main = "Gene Expression distribution unificated matriz Wistar-STZ and SHR after Batch correction", 
        col = c("red", "green")[as.numeric(factor(group_labels))], 
        las = 2, ylab = "Normalized Expression (log2 intensity)", cex.main = 1)
legend("topright", legend = levels(factor(group_labels)), fill = c("red", "green"))

# Define group factor for Differential Expression analysis
sample_ids <- colnames(combat_expr)
group <- as.factor(colnames(combat_expr))

`

# Differential Expression Analysis between Control and Disease samples from integrated Wistar-STZ and SHR Datasets
design <- model.matrix(~ group)
fit_merge <- lmFit(combat_expr, design)
fit_merge <- eBayes(fit_merge)
tt_merge <- topTable(fit_merge, coef = 2, number = Inf)
head(tt_merge)
##                         logFC   AveExpr         t      P.Value    adj.P.Val
## ENSRNOG00000021201 -0.6760251 10.469248 -8.893688 5.804914e-10 2.781134e-06
## ENSRNOG00000046889 -0.5650384  9.959989 -8.387019 2.092478e-09 3.743838e-06
## ENSRNOG00000018505 -0.5274553  8.837082 -8.342741 2.344294e-09 3.743838e-06
## ENSRNOG00000020308 -0.6863521 11.058115 -8.105058 4.333082e-09 5.189949e-06
## ENSRNOG00000017226  0.8005053  8.807031  7.914978 7.118410e-09 6.820861e-06
## ENSRNOG00000009421  0.4355431  9.692999  7.606356 1.608975e-08 1.268900e-05
##                            B
## ENSRNOG00000021201 12.658844
## ENSRNOG00000046889 11.463382
## ENSRNOG00000018505 11.357111
## ENSRNOG00000020308 10.781767
## ENSRNOG00000017226 10.315829
## ENSRNOG00000009421  9.548571
# Gene annotation of Differential Expression using Ensembl gene IDs
ensembl <- useEnsembl(biomart = "ensembl", dataset = "rnorvegicus_gene_ensembl")
gene_ids <- rownames(tt_merge) 
annot_attributes <- c("ensembl_gene_id", "external_gene_name", "entrezgene_id", "transcript_biotype")
geneDat <- getBM(attributes = annot_attributes,
                 filters = "ensembl_gene_id",
                 values = gene_ids,
                 mart = ensembl)

tt_merge$ensembl_gene_id <- rownames(tt_merge)
tt_annotated <- merge(tt_merge, geneDat, by = "ensembl_gene_id")

tt_annotated <- tt_annotated[, c("ensembl_gene_id", "external_gene_name", 
                                 "entrezgene_id", "transcript_biotype",
                                 "logFC", "AveExpr", "t", "P.Value", 
                                 "adj.P.Val", "B")]

head(tt_annotated)
##      ensembl_gene_id external_gene_name entrezgene_id transcript_biotype
## 1 ENSRNOG00000000007               Gad1         24379     protein_coding
## 2 ENSRNOG00000000041            Slc26a1         64076     protein_coding
## 3 ENSRNOG00000000047               Cd82         83628     protein_coding
## 4 ENSRNOG00000000048                Gak         81659     protein_coding
## 5 ENSRNOG00000000053                Crp         25419     protein_coding
## 6 ENSRNOG00000000055              Fcrl6        305103     protein_coding
##          logFC  AveExpr           t    P.Value adj.P.Val         B
## 1  0.009507472 5.315268  0.12671421 0.90000113 0.9609774 -6.400702
## 2 -0.027880566 6.698368 -0.38747838 0.70110216 0.8608653 -6.333485
## 3 -0.008929563 7.227801 -0.10794539 0.91474842 0.9678798 -6.402915
## 4 -0.133612599 7.238650 -1.88683738 0.06876627 0.3081938 -4.718885
## 5  0.003304730 4.866234  0.04642736 0.96327333 0.9871749 -6.407689
## 6  0.013249601 5.662999  0.21023844 0.83488455 0.9265292 -6.386568