Co-expression network analysis

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(WGCNA)

## Loading required package: dynamicTreeCut
## Loading required package: fastcluster
## 
## Attaching package: 'fastcluster'
## 
## The following object is masked from 'package:stats':
## 
##     hclust
## 
## 
## 
## Attaching package: 'WGCNA'
## 
## The following object is masked from 'package:stats':
## 
##     cor

options(stringsAsFactors = FALSE)

HTAdata <- read.csv("/Users/lorandacalderonzamora/Respaldo/LORANDA/microarreglo/Conjuto de datos/Hipertensión/GSE24752/datExprHTA.csv")
dim(HTAdata)

## [1] 13364     7

names(HTAdata)

## [1] "X"         "GSM609525" "GSM609526" "GSM609527" "GSM609528" "GSM609529"
## [7] "GSM609530"

rownames(HTAdata) <- HTAdata$X
HTAdata <- HTAdata[, -1]
datExpr <- as.data.frame(t(HTAdata))

dim(datExpr)

## [1]     6 13364

gsg = goodSamplesGenes(datExpr, verbose = 3);

##  Flagging genes and samples with too many missing values...
##   ..step 1

gsg$allOK

## [1] TRUE

sampleTree = hclust(dist(datExpr), method = "average");
sizeGrWindow(12,9)
par(cex = 0.6);
par(mar = c(0,4,2,0))
plot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="", cex.lab = 1.5, 
     cex.axis = 1.5, cex.main = 2)

traitData <- read.csv("/Users/lorandacalderonzamora/Respaldo/LORANDA/microarreglo/Conjuto de datos/Hipertensión/GSE24752/datMeta.csv")
dim(traitData )

## [1] 6 4

names(traitData )

## [1] "geo_accession" "Disease"       "Age"           "Gender"

sampleNames <- rownames(datExpr)
head(traitData$geo_accession)

## [1] "GSM609525" "GSM609526" "GSM609527" "GSM609528" "GSM609529" "GSM609530"

all(sampleNames %in% traitData$geo_accession)

## [1] TRUE

traitRows <- match(sampleNames, traitData$geo_accession)
datTraits <- traitData[traitRows, ]
rownames(datTraits) <- datTraits$geo_accession

datTraits[] <- lapply(datTraits, function(x) {
  if (is.character(x) || is.factor(x)) {
    as.numeric(as.factor(x))
  } else {
    x  
  }
})
datTraits_clean <- datTraits[, sapply(datTraits, is.numeric)]

str(datTraits_clean)

## 'data.frame':    6 obs. of  4 variables:
##  $ geo_accession: num  1 2 3 4 5 6
##  $ Disease      : num  2 2 2 1 1 1
##  $ Age          : int  47 49 56 44 54 38
##  $ Gender       : num  2 2 1 2 2 1

sampleTree2 <- hclust(dist(datExpr), method = "average")
traitColors <- numbers2colors(datTraits_clean, signed = FALSE)
plotDendroAndColors(sampleTree2, traitColors,
                    groupLabels = names(datTraits_clean),
                    main = "Sample dendrogram and trait heatmap")

save(datExpr, datTraits_clean, file = "HTAdataInput.RData")

lnames = load(file = "HTAdataInput.RData");
#The variable lnames contains the names of loaded variables.
lnames

## [1] "datExpr"         "datTraits_clean"

# Filter the least variable genes (optional)
variance_genes <- apply(datExpr, 2, var)
datExpr_filtered <- datExpr[, variance_genes > quantile(variance_genes, 0.75)]

# Select a narrower range of powers due to few samples
powers = c(1:6)

sft = pickSoftThreshold(datExpr_filtered, powerVector = powers, verbose = 5)

## pickSoftThreshold: will use block size 3341.
##  pickSoftThreshold: calculating connectivity for given powers...
##    ..working on genes 1 through 3341 of 3341

## Warning: executing %dopar% sequentially: no parallel backend registered

##   Power SFT.R.sq  slope truncated.R.sq mean.k. median.k. max.k.
## 1     1   0.8510 2.3700         0.8590    2070      2250   2560
## 2     2   0.8060 1.0800         0.8640    1530      1700   2150
## 3     3   0.6970 0.6110         0.7990    1210      1350   1880
## 4     4   0.5840 0.3700         0.7000     997      1110   1670
## 5     5   0.3000 0.1960         0.4060     842       930   1500
## 6     6   0.0705 0.0805         0.0312     726       791   1360

sizeGrWindow(9, 5)
par(mfrow = c(1, 2))
cex1 = 0.9

plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3]) * sft$fitIndices[,2],
     xlab = "Soft Threshold (power)", 
     ylab = "Scale Free Topology Model Fit, signed R^2", type = "n",
     main = "Scale independence")
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3]) * sft$fitIndices[,2], 
     labels = powers, cex = cex1, col = "red")
abline(h = 0.80, col = "red")  # Ajuste del umbral a 0.80

plot(sft$fitIndices[,1], sft$fitIndices[,5],
     xlab = "Soft Threshold (power)", 
     ylab = "Mean Connectivity", type = "n",
     main = "Mean connectivity")
text(sft$fitIndices[,1], sft$fitIndices[,5], labels = powers, cex = cex1, col = "red")

net = blockwiseModules(
  datExpr, 
  power = 6,  
  TOMType = "unsigned",  
  minModuleSize = 200,  
  reassignThreshold = 0, 
  mergeCutHeight = 0.20,  
  numericLabels = TRUE, 
  pamRespectsDendro = FALSE,  
  saveTOMs = TRUE, 
  saveTOMFileBase = "TOM_diabetes",  
  verbose = 3
)

##  Calculating module eigengenes block-wise from all genes
##    Flagging genes and samples with too many missing values...
##     ..step 1
##  ....pre-clustering genes to determine blocks..
##    Projective K-means:
##    ..k-means clustering..
##    ..merging smaller clusters...
## Block sizes:
## gBlocks
##    1    2    3 
## 4791 4647 3926 
##  ..Working on block 1 .
##     TOM calculation: adjacency..
##     ..will not use multithreading.
##      Fraction of slow calculations: 0.000000
##     ..connectivity..
##     ..matrix multiplication (system BLAS)..
##     ..normalization..
##     ..done.
##    ..saving TOM for block 1 into file TOM_diabetes-block.1.RData
##  ....clustering..
##  ....detecting modules..
##  ....calculating module eigengenes..
##  ....checking kME in modules..
##  ..Working on block 2 .
##     TOM calculation: adjacency..
##     ..will not use multithreading.
##      Fraction of slow calculations: 0.000000
##     ..connectivity..
##     ..matrix multiplication (system BLAS)..
##     ..normalization..
##     ..done.
##    ..saving TOM for block 2 into file TOM_diabetes-block.2.RData
##  ....clustering..
##  ....detecting modules..
##  ....calculating module eigengenes..
##  ....checking kME in modules..
##      ..removing 1 genes from module 2 because their KME is too low.
##  ..Working on block 3 .
##     TOM calculation: adjacency..
##     ..will not use multithreading.
##      Fraction of slow calculations: 0.000000
##     ..connectivity..
##     ..matrix multiplication (system BLAS)..
##     ..normalization..
##     ..done.
##    ..saving TOM for block 3 into file TOM_diabetes-block.3.RData
##  ....clustering..
##  ....detecting modules..
##  ....calculating module eigengenes..
##  ....checking kME in modules..
##  ..merging modules that are too close..
##      mergeCloseModules: Merging modules whose distance is less than 0.2
##        Calculating new MEs...

sizeGrWindow(12, 9)

mergedColors = labels2colors(net$colors)

for (i in 1:length(net$dendrograms)) {
  plotDendroAndColors(
    net$dendrograms[[i]],            
    mergedColors[net$blockGenes[[i]]],  
    "Module colors",                  
    dendroLabels = FALSE, 
    hang = 0.03,                      
    addGuide = TRUE, guideHang = 0.05 
  )
}

table(net$colors)  # Gene counts per module

## 
##    0    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15 
##    1 2786 2145 1699 1352 1000  692  492  465  365  326  320  307  279  260  240 
##   16   17   18 
##  220  211  204

MEs = net$MEs  # # Module Eigengene

moduleTraitCor = cor(MEs, datTraits_clean, use = "pairwise.complete.obs")

moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nrow(datExpr))

print(moduleTraitCor)

##      geo_accession     Disease         Age      Gender
## ME15   0.319732937 -0.56693177 -0.03841643  0.68801756
## ME10  -0.399391716  0.18315991 -0.36676443  0.56305811
## ME4   -0.363309341  0.01870732 -0.21841128  0.84468787
## ME18  -0.167425047 -0.31489661 -0.32684184  0.84747709
## ME3   -0.660037089  0.44477234 -0.47913521  0.34070876
## ME5   -0.351463972 -0.01016085 -0.47573845  0.50660779
## ME9    0.414624763 -0.54451781 -0.47018969  0.02518454
## ME1    0.784065848 -0.40760334  0.04609465 -0.79163832
## ME2    0.663148275 -0.44966731  0.19969097 -0.41272112
## ME6    0.095951713  0.09078063 -0.20557149 -0.61005783
## ME8    0.489225672 -0.51141806 -0.31829577 -0.33796883
## ME11   0.111514996  0.06737301  0.28901354 -0.62630268
## ME13  -0.270968006  0.11539459  0.27003195  0.01436793
## ME12  -0.354988176  0.07269213  0.22655183  0.31819839
## ME7   -0.209452973  0.33380432  0.77876119 -0.14911383
## ME17   0.274043090 -0.31169959  0.46719221 -0.10513017
## ME14  -0.515679665  0.32417830  0.81673383  0.62832587
## ME16  -0.899899738  0.85148594  0.47374788  0.24303802
## ME0   -0.004063263 -0.32637382  0.38434291  0.68563470

print(moduleTraitPvalue)

##      geo_accession    Disease        Age     Gender
## ME15    0.53674361 0.24071157 0.94240371 0.13081647
## ME10    0.43276666 0.72833241 0.47452122 0.24466724
## ME4     0.47901326 0.97194230 0.67759257 0.03430958
## ME18    0.75120899 0.54326764 0.52719477 0.03312077
## ME3     0.15371660 0.37683446 0.33629485 0.50871201
## ME5     0.49451167 0.98475925 0.34022857 0.30509913
## ME9     0.41370269 0.26394795 0.34668985 0.96223117
## ME1     0.06490710 0.42245470 0.93090700 0.06059892
## ME2     0.15109250 0.37096056 0.70444504 0.41606952
## ME6     0.85651413 0.86420312 0.69598645 0.19843603
## ME8     0.32470756 0.29975321 0.53867996 0.51234865
## ME11    0.83342088 0.89909339 0.57855017 0.18338117
## ME13    0.60349572 0.82767640 0.60479706 0.97844958
## ME12    0.48988494 0.89115386 0.66598622 0.53881125
## ME7     0.69041495 0.51789064 0.06800547 0.77798702
## ME17    0.59922563 0.54759245 0.35019837 0.84288571
## ME14    0.29504669 0.53076675 0.04730210 0.18154064
## ME16    0.01452859 0.03144679 0.34254147 0.64262080
## ME0     0.99390514 0.52782191 0.45187310 0.13270465

# Extract the correlations and p-values for the trait "Disease"
cor_Disease = moduleTraitCor[, "Disease"]
pval_Disease = moduleTraitPvalue[, "Disease"]

best_module = which.max(abs(cor_Disease))

print(paste("Most significant module:", names(cor_Disease)[best_module]))

## [1] "Most significant module: ME16"

print(paste("Correlation:", cor_Disease[best_module]))

## [1] "Correlation: 0.851485940890004"

print(paste("P-value:", pval_Disease[best_module]))

## [1] "P-value: 0.0314467939709487"

module16_genes = names(datExpr)[net$colors == 16]
head(module16_genes)

## [1] "ENSG00000003436" "ENSG00000004399" "ENSG00000005007" "ENSG00000005238"
## [5] "ENSG00000005961" "ENSG00000007520"

str(module16_genes)

##  chr [1:220] "ENSG00000003436" "ENSG00000004399" "ENSG00000005007" ...

# PPI network analysis
library(STRINGdb)
string_db <- STRINGdb$new(version = "11.5", species = 9606, 
                          score_threshold = 400, input_directory = "")

# Mapping Module 16 genes and visualizing Protein-Protein Interaction Network
module16_genes <- gsub("\\..*", "", module16_genes)
df_genes <- data.frame(ENSEMBL = module16_genes)
mapped_genes <- string_db$map(df_genes, "ENSEMBL", removeUnmappedRows = TRUE)

## Warning:  we couldn't map to STRING 16% of your identifiers

print(head(mapped_genes))

##           ENSEMBL            STRING_id
## 1 ENSG00000003436 9606.ENSP00000233156
## 2 ENSG00000004399 9606.ENSP00000317128
## 3 ENSG00000005007 9606.ENSP00000470142
## 4 ENSG00000005238 9606.ENSP00000367823
## 5 ENSG00000005961 9606.ENSP00000262407
## 6 ENSG00000007520 9606.ENSP00000007390

unmapped_genes <- setdiff(df_genes$ENSEMBL, mapped_genes$ENSEMBL)
print(unmapped_genes)

##  [1] "ENSG00000174572" "ENSG00000213600" "ENSG00000214297" "ENSG00000216718"
##  [5] "ENSG00000219491" "ENSG00000225312" "ENSG00000227969" "ENSG00000232176"
##  [9] "ENSG00000233111" "ENSG00000235175" "ENSG00000237887" "ENSG00000242571"
## [13] "ENSG00000244535" "ENSG00000259291" "ENSG00000260306" "ENSG00000268173"
## [17] "ENSG00000273993" "ENSG00000275544" "ENSG00000276068" "ENSG00000277276"
## [21] "ENSG00000281380" "ENSG00000284796" "ENSG00000284980" "ENSG00000285049"
## [25] "ENSG00000285426" "ENSG00000285602" "ENSG00000286522" "ENSG00000287978"
## [29] "ENSG00000291003" "ENSG00000291480" "ENSG00000291583" "ENSG00000291971"
## [33] "ENSG00000292344" "ENSG00000292354" "ENSG00000301761" "ENSG00000307722"
## [37] "ENSG00000310469"

string_db$plot_network(mapped_genes$STRING_id)

library(org.Hs.eg.db)

## Loading required package: AnnotationDbi

## Loading required package: stats4

## Loading required package: BiocGenerics

## 
## Attaching package: 'BiocGenerics'

## The following objects are masked from 'package:lubridate':
## 
##     intersect, setdiff, union

## The following objects are masked from 'package:dplyr':
## 
##     combine, intersect, setdiff, union

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unsplit, which.max, which.min

## Loading required package: Biobase

## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.

## Loading required package: IRanges

## Loading required package: S4Vectors

## 
## Attaching package: 'S4Vectors'

## The following objects are masked from 'package:lubridate':
## 
##     second, second<-

## The following objects are masked from 'package:dplyr':
## 
##     first, rename

## The following object is masked from 'package:tidyr':
## 
##     expand

## The following object is masked from 'package:utils':
## 
##     findMatches

## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname

## 
## Attaching package: 'IRanges'

## The following object is masked from 'package:lubridate':
## 
##     %within%

## The following objects are masked from 'package:dplyr':
## 
##     collapse, desc, slice

## The following object is masked from 'package:purrr':
## 
##     reduce

## 
## Attaching package: 'AnnotationDbi'

## The following object is masked from 'package:dplyr':
## 
##     select

##

# Functional enrichment analysis
enrichment_results_GO <- string_db$get_enrichment(
  string_ids = mapped_genes$STRING_id,  
  category = "Process",  # GO "Process" or "Pathway" by KEGG
  methodMT = "fdr",  
  iea = FALSE  
)

## Warning in string_db$get_enrichment(string_ids = mapped_genes$STRING_id, :
## methodMT parameter is depecated. Only FDR correction is available.

## Warning in string_db$get_enrichment(string_ids = mapped_genes$STRING_id, : iea
## parameter is deprecated.

head(enrichment_results_GO)

##  [1] category                      term                         
##  [3] number_of_genes               number_of_genes_in_background
##  [5] ncbiTaxonId                   inputGenes                   
##  [7] preferredNames                p_value                      
##  [9] fdr                           description                  
## <0 rows> (or 0-length row.names)

write.csv(enrichment_results_GO, "enrichment_results_GO_module16.csv", row.names = FALSE)

Co-expression network analysis

Loranda_Calderon

2024-10-11