library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(WGCNA)
## Loading required package: dynamicTreeCut
## Loading required package: fastcluster
##
## Attaching package: 'fastcluster'
##
## The following object is masked from 'package:stats':
##
## hclust
##
##
##
## Attaching package: 'WGCNA'
##
## The following object is masked from 'package:stats':
##
## cor
options(stringsAsFactors = FALSE)
HTAdata <- read.csv("/Users/lorandacalderonzamora/Respaldo/LORANDA/microarreglo/Conjuto de datos/Hipertensión/GSE24752/datExprHTA.csv")
dim(HTAdata)
## [1] 13364 7
names(HTAdata)
## [1] "X" "GSM609525" "GSM609526" "GSM609527" "GSM609528" "GSM609529"
## [7] "GSM609530"
rownames(HTAdata) <- HTAdata$X
HTAdata <- HTAdata[, -1]
datExpr <- as.data.frame(t(HTAdata))
dim(datExpr)
## [1] 6 13364
gsg = goodSamplesGenes(datExpr, verbose = 3);
## Flagging genes and samples with too many missing values...
## ..step 1
gsg$allOK
## [1] TRUE
sampleTree = hclust(dist(datExpr), method = "average");
sizeGrWindow(12,9)
par(cex = 0.6);
par(mar = c(0,4,2,0))
plot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="", cex.lab = 1.5,
cex.axis = 1.5, cex.main = 2)
traitData <- read.csv("/Users/lorandacalderonzamora/Respaldo/LORANDA/microarreglo/Conjuto de datos/Hipertensión/GSE24752/datMeta.csv")
dim(traitData )
## [1] 6 4
names(traitData )
## [1] "geo_accession" "Disease" "Age" "Gender"
sampleNames <- rownames(datExpr)
head(traitData$geo_accession)
## [1] "GSM609525" "GSM609526" "GSM609527" "GSM609528" "GSM609529" "GSM609530"
all(sampleNames %in% traitData$geo_accession)
## [1] TRUE
traitRows <- match(sampleNames, traitData$geo_accession)
datTraits <- traitData[traitRows, ]
rownames(datTraits) <- datTraits$geo_accession
datTraits[] <- lapply(datTraits, function(x) {
if (is.character(x) || is.factor(x)) {
as.numeric(as.factor(x))
} else {
x
}
})
datTraits_clean <- datTraits[, sapply(datTraits, is.numeric)]
str(datTraits_clean)
## 'data.frame': 6 obs. of 4 variables:
## $ geo_accession: num 1 2 3 4 5 6
## $ Disease : num 2 2 2 1 1 1
## $ Age : int 47 49 56 44 54 38
## $ Gender : num 2 2 1 2 2 1
sampleTree2 <- hclust(dist(datExpr), method = "average")
traitColors <- numbers2colors(datTraits_clean, signed = FALSE)
plotDendroAndColors(sampleTree2, traitColors,
groupLabels = names(datTraits_clean),
main = "Sample dendrogram and trait heatmap")

save(datExpr, datTraits_clean, file = "HTAdataInput.RData")
lnames = load(file = "HTAdataInput.RData");
#The variable lnames contains the names of loaded variables.
lnames
## [1] "datExpr" "datTraits_clean"
# Filter the least variable genes (optional)
variance_genes <- apply(datExpr, 2, var)
datExpr_filtered <- datExpr[, variance_genes > quantile(variance_genes, 0.75)]
# Select a narrower range of powers due to few samples
powers = c(1:6)
sft = pickSoftThreshold(datExpr_filtered, powerVector = powers, verbose = 5)
## pickSoftThreshold: will use block size 3341.
## pickSoftThreshold: calculating connectivity for given powers...
## ..working on genes 1 through 3341 of 3341
## Warning: executing %dopar% sequentially: no parallel backend registered
## Power SFT.R.sq slope truncated.R.sq mean.k. median.k. max.k.
## 1 1 0.8510 2.3700 0.8590 2070 2250 2560
## 2 2 0.8060 1.0800 0.8640 1530 1700 2150
## 3 3 0.6970 0.6110 0.7990 1210 1350 1880
## 4 4 0.5840 0.3700 0.7000 997 1110 1670
## 5 5 0.3000 0.1960 0.4060 842 930 1500
## 6 6 0.0705 0.0805 0.0312 726 791 1360
sizeGrWindow(9, 5)
par(mfrow = c(1, 2))
cex1 = 0.9
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3]) * sft$fitIndices[,2],
xlab = "Soft Threshold (power)",
ylab = "Scale Free Topology Model Fit, signed R^2", type = "n",
main = "Scale independence")
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3]) * sft$fitIndices[,2],
labels = powers, cex = cex1, col = "red")
abline(h = 0.80, col = "red") # Ajuste del umbral a 0.80
plot(sft$fitIndices[,1], sft$fitIndices[,5],
xlab = "Soft Threshold (power)",
ylab = "Mean Connectivity", type = "n",
main = "Mean connectivity")
text(sft$fitIndices[,1], sft$fitIndices[,5], labels = powers, cex = cex1, col = "red")
net = blockwiseModules(
datExpr,
power = 6,
TOMType = "unsigned",
minModuleSize = 200,
reassignThreshold = 0,
mergeCutHeight = 0.20,
numericLabels = TRUE,
pamRespectsDendro = FALSE,
saveTOMs = TRUE,
saveTOMFileBase = "TOM_diabetes",
verbose = 3
)
## Calculating module eigengenes block-wise from all genes
## Flagging genes and samples with too many missing values...
## ..step 1
## ....pre-clustering genes to determine blocks..
## Projective K-means:
## ..k-means clustering..
## ..merging smaller clusters...
## Block sizes:
## gBlocks
## 1 2 3
## 4791 4647 3926
## ..Working on block 1 .
## TOM calculation: adjacency..
## ..will not use multithreading.
## Fraction of slow calculations: 0.000000
## ..connectivity..
## ..matrix multiplication (system BLAS)..
## ..normalization..
## ..done.
## ..saving TOM for block 1 into file TOM_diabetes-block.1.RData
## ....clustering..
## ....detecting modules..
## ....calculating module eigengenes..
## ....checking kME in modules..
## ..Working on block 2 .
## TOM calculation: adjacency..
## ..will not use multithreading.
## Fraction of slow calculations: 0.000000
## ..connectivity..
## ..matrix multiplication (system BLAS)..
## ..normalization..
## ..done.
## ..saving TOM for block 2 into file TOM_diabetes-block.2.RData
## ....clustering..
## ....detecting modules..
## ....calculating module eigengenes..
## ....checking kME in modules..
## ..removing 1 genes from module 2 because their KME is too low.
## ..Working on block 3 .
## TOM calculation: adjacency..
## ..will not use multithreading.
## Fraction of slow calculations: 0.000000
## ..connectivity..
## ..matrix multiplication (system BLAS)..
## ..normalization..
## ..done.
## ..saving TOM for block 3 into file TOM_diabetes-block.3.RData
## ....clustering..
## ....detecting modules..
## ....calculating module eigengenes..
## ....checking kME in modules..
## ..merging modules that are too close..
## mergeCloseModules: Merging modules whose distance is less than 0.2
## Calculating new MEs...
sizeGrWindow(12, 9)
mergedColors = labels2colors(net$colors)
for (i in 1:length(net$dendrograms)) {
plotDendroAndColors(
net$dendrograms[[i]],
mergedColors[net$blockGenes[[i]]],
"Module colors",
dendroLabels = FALSE,
hang = 0.03,
addGuide = TRUE, guideHang = 0.05
)
}
table(net$colors) # Gene counts per module
##
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
## 1 2786 2145 1699 1352 1000 692 492 465 365 326 320 307 279 260 240
## 16 17 18
## 220 211 204
MEs = net$MEs # # Module Eigengene
moduleTraitCor = cor(MEs, datTraits_clean, use = "pairwise.complete.obs")
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nrow(datExpr))
print(moduleTraitCor)
## geo_accession Disease Age Gender
## ME15 0.319732937 -0.56693177 -0.03841643 0.68801756
## ME10 -0.399391716 0.18315991 -0.36676443 0.56305811
## ME4 -0.363309341 0.01870732 -0.21841128 0.84468787
## ME18 -0.167425047 -0.31489661 -0.32684184 0.84747709
## ME3 -0.660037089 0.44477234 -0.47913521 0.34070876
## ME5 -0.351463972 -0.01016085 -0.47573845 0.50660779
## ME9 0.414624763 -0.54451781 -0.47018969 0.02518454
## ME1 0.784065848 -0.40760334 0.04609465 -0.79163832
## ME2 0.663148275 -0.44966731 0.19969097 -0.41272112
## ME6 0.095951713 0.09078063 -0.20557149 -0.61005783
## ME8 0.489225672 -0.51141806 -0.31829577 -0.33796883
## ME11 0.111514996 0.06737301 0.28901354 -0.62630268
## ME13 -0.270968006 0.11539459 0.27003195 0.01436793
## ME12 -0.354988176 0.07269213 0.22655183 0.31819839
## ME7 -0.209452973 0.33380432 0.77876119 -0.14911383
## ME17 0.274043090 -0.31169959 0.46719221 -0.10513017
## ME14 -0.515679665 0.32417830 0.81673383 0.62832587
## ME16 -0.899899738 0.85148594 0.47374788 0.24303802
## ME0 -0.004063263 -0.32637382 0.38434291 0.68563470
print(moduleTraitPvalue)
## geo_accession Disease Age Gender
## ME15 0.53674361 0.24071157 0.94240371 0.13081647
## ME10 0.43276666 0.72833241 0.47452122 0.24466724
## ME4 0.47901326 0.97194230 0.67759257 0.03430958
## ME18 0.75120899 0.54326764 0.52719477 0.03312077
## ME3 0.15371660 0.37683446 0.33629485 0.50871201
## ME5 0.49451167 0.98475925 0.34022857 0.30509913
## ME9 0.41370269 0.26394795 0.34668985 0.96223117
## ME1 0.06490710 0.42245470 0.93090700 0.06059892
## ME2 0.15109250 0.37096056 0.70444504 0.41606952
## ME6 0.85651413 0.86420312 0.69598645 0.19843603
## ME8 0.32470756 0.29975321 0.53867996 0.51234865
## ME11 0.83342088 0.89909339 0.57855017 0.18338117
## ME13 0.60349572 0.82767640 0.60479706 0.97844958
## ME12 0.48988494 0.89115386 0.66598622 0.53881125
## ME7 0.69041495 0.51789064 0.06800547 0.77798702
## ME17 0.59922563 0.54759245 0.35019837 0.84288571
## ME14 0.29504669 0.53076675 0.04730210 0.18154064
## ME16 0.01452859 0.03144679 0.34254147 0.64262080
## ME0 0.99390514 0.52782191 0.45187310 0.13270465
# Extract the correlations and p-values for the trait "Disease"
cor_Disease = moduleTraitCor[, "Disease"]
pval_Disease = moduleTraitPvalue[, "Disease"]
best_module = which.max(abs(cor_Disease))
print(paste("Most significant module:", names(cor_Disease)[best_module]))
## [1] "Most significant module: ME16"
print(paste("Correlation:", cor_Disease[best_module]))
## [1] "Correlation: 0.851485940890004"
print(paste("P-value:", pval_Disease[best_module]))
## [1] "P-value: 0.0314467939709487"
module16_genes = names(datExpr)[net$colors == 16]
head(module16_genes)
## [1] "ENSG00000003436" "ENSG00000004399" "ENSG00000005007" "ENSG00000005238"
## [5] "ENSG00000005961" "ENSG00000007520"
str(module16_genes)
## chr [1:220] "ENSG00000003436" "ENSG00000004399" "ENSG00000005007" ...
# PPI network analysis
library(STRINGdb)
string_db <- STRINGdb$new(version = "11.5", species = 9606,
score_threshold = 400, input_directory = "")
# Mapping Module 16 genes and visualizing Protein-Protein Interaction Network
module16_genes <- gsub("\\..*", "", module16_genes)
df_genes <- data.frame(ENSEMBL = module16_genes)
mapped_genes <- string_db$map(df_genes, "ENSEMBL", removeUnmappedRows = TRUE)
## Warning: we couldn't map to STRING 16% of your identifiers
print(head(mapped_genes))
## ENSEMBL STRING_id
## 1 ENSG00000003436 9606.ENSP00000233156
## 2 ENSG00000004399 9606.ENSP00000317128
## 3 ENSG00000005007 9606.ENSP00000470142
## 4 ENSG00000005238 9606.ENSP00000367823
## 5 ENSG00000005961 9606.ENSP00000262407
## 6 ENSG00000007520 9606.ENSP00000007390
unmapped_genes <- setdiff(df_genes$ENSEMBL, mapped_genes$ENSEMBL)
print(unmapped_genes)
## [1] "ENSG00000174572" "ENSG00000213600" "ENSG00000214297" "ENSG00000216718"
## [5] "ENSG00000219491" "ENSG00000225312" "ENSG00000227969" "ENSG00000232176"
## [9] "ENSG00000233111" "ENSG00000235175" "ENSG00000237887" "ENSG00000242571"
## [13] "ENSG00000244535" "ENSG00000259291" "ENSG00000260306" "ENSG00000268173"
## [17] "ENSG00000273993" "ENSG00000275544" "ENSG00000276068" "ENSG00000277276"
## [21] "ENSG00000281380" "ENSG00000284796" "ENSG00000284980" "ENSG00000285049"
## [25] "ENSG00000285426" "ENSG00000285602" "ENSG00000286522" "ENSG00000287978"
## [29] "ENSG00000291003" "ENSG00000291480" "ENSG00000291583" "ENSG00000291971"
## [33] "ENSG00000292344" "ENSG00000292354" "ENSG00000301761" "ENSG00000307722"
## [37] "ENSG00000310469"
string_db$plot_network(mapped_genes$STRING_id)

library(org.Hs.eg.db)
## Loading required package: AnnotationDbi
## Loading required package: stats4
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:lubridate':
##
## intersect, setdiff, union
## The following objects are masked from 'package:dplyr':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
## table, tapply, union, unique, unsplit, which.max, which.min
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: IRanges
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:lubridate':
##
## second, second<-
## The following objects are masked from 'package:dplyr':
##
## first, rename
## The following object is masked from 'package:tidyr':
##
## expand
## The following object is masked from 'package:utils':
##
## findMatches
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
##
## Attaching package: 'IRanges'
## The following object is masked from 'package:lubridate':
##
## %within%
## The following objects are masked from 'package:dplyr':
##
## collapse, desc, slice
## The following object is masked from 'package:purrr':
##
## reduce
##
## Attaching package: 'AnnotationDbi'
## The following object is masked from 'package:dplyr':
##
## select
##
# Functional enrichment analysis
enrichment_results_GO <- string_db$get_enrichment(
string_ids = mapped_genes$STRING_id,
category = "Process", # GO "Process" or "Pathway" by KEGG
methodMT = "fdr",
iea = FALSE
)
## Warning in string_db$get_enrichment(string_ids = mapped_genes$STRING_id, :
## methodMT parameter is depecated. Only FDR correction is available.
## Warning in string_db$get_enrichment(string_ids = mapped_genes$STRING_id, : iea
## parameter is deprecated.
head(enrichment_results_GO)
## [1] category term
## [3] number_of_genes number_of_genes_in_background
## [5] ncbiTaxonId inputGenes
## [7] preferredNames p_value
## [9] fdr description
## <0 rows> (or 0-length row.names)
write.csv(enrichment_results_GO, "enrichment_results_GO_module16.csv", row.names = FALSE)