library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(WGCNA)
## Loading required package: dynamicTreeCut
## Loading required package: fastcluster
## 
## Attaching package: 'fastcluster'
## 
## The following object is masked from 'package:stats':
## 
##     hclust
## 
## 
## 
## Attaching package: 'WGCNA'
## 
## The following object is masked from 'package:stats':
## 
##     cor
options(stringsAsFactors = FALSE)
HTGdata <- read.csv("/Users/lorandacalderonzamora/GSE45516/CEL/datExprHTG.csv")
dim(HTGdata)
## [1] 23663    10
names(HTGdata)
##  [1] "Gene_ID"    "GSM1106065" "GSM1106066" "GSM1106067" "GSM1106068"
##  [6] "GSM1106069" "GSM1106070" "GSM1106071" "GSM1106072" "GSM1106073"
rownames(HTGdata) <- HTGdata$X
HTGdata<- HTGdata[, -1]
datExpr <- as.data.frame(t(HTGdata))
dim(datExpr)
## [1]     9 23663
gsg = goodSamplesGenes(datExpr, verbose = 3);
##  Flagging genes and samples with too many missing values...
##   ..step 1
gsg$allOK
## [1] TRUE
sampleTree = hclust(dist(datExpr), method = "average");
sizeGrWindow(12,9)
par(cex = 0.6);
par(mar = c(0,4,2,0))
plot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="", cex.lab = 1.5, 
     cex.axis = 1.5, cex.main = 2)
traitData <- read.csv("/Users/lorandacalderonzamora/GSE45516/CEL/datMeta.csv")
dim(traitData )
## [1] 9 4
names(traitData )
## [1] "geo_accession" "Disease"       "Age"           "Gender"
sampleNames <- trimws(rownames(datExpr))
traitData$geo_accession <- trimws(traitData$geo_accession)
sampleNames <- rownames(datExpr)
head(traitData$geo_accession)
## [1] "GSM1106065" "GSM1106066" "GSM1106067" "GSM1106068" "GSM1106069"
## [6] "GSM1106070"
all(sampleNames %in% traitData$geo_accession)
## [1] TRUE
traitRows <- match(sampleNames, traitData$geo_accession)
datTraits <- traitData[traitRows, ]
rownames(datTraits) <- datTraits$geo_accession
datTraits[] <- lapply(datTraits, function(x) {
  if (is.character(x) || is.factor(x)) {
    as.numeric(as.factor(x))
  } else {
    x  
  }
})
datTraits_clean <- datTraits[, sapply(datTraits, is.numeric)]

str(datTraits_clean)
## 'data.frame':    9 obs. of  4 variables:
##  $ geo_accession: num  1 2 3 4 5 6 7 8 9
##  $ Disease      : num  1 1 1 2 2 2 2 2 2
##  $ Age          : int  28 48 57 38 63 57 47 38 37
##  $ Gender       : num  1 1 1 1 1 1 1 1 1
sampleTree2 <- hclust(dist(datExpr), method = "average")
traitColors <- numbers2colors(datTraits_clean, signed = FALSE)
## Warning in numbers2colors(datTraits_clean, signed = FALSE): (some columns in)
## 'x' are constant. Their color will be the color of NA.
plotDendroAndColors(sampleTree2, traitColors,
                    groupLabels = names(datTraits_clean),
                    main = "Sample dendrogram and trait heatmap")

save(datExpr, datTraits_clean, file = "HTGdataInput.RData")
lnames = load(file = "HTGdataInput.RData");
#The variable lnames contains the names of loaded variables.
lnames
## [1] "datExpr"         "datTraits_clean"
# Filter the least variable genes (optional)
variance_genes <- apply(datExpr, 2, var)
datExpr_filtered <- datExpr[, variance_genes > quantile(variance_genes, 0.75)]
# Select a narrower range of powers due to few samples
powers = c(1:6)

sft = pickSoftThreshold(datExpr_filtered, powerVector = powers, verbose = 5)
## pickSoftThreshold: will use block size 5916.
##  pickSoftThreshold: calculating connectivity for given powers...
##    ..working on genes 1 through 5916 of 5916
## Warning: executing %dopar% sequentially: no parallel backend registered
##   Power SFT.R.sq  slope truncated.R.sq mean.k. median.k. max.k.
## 1     1    0.614  1.700          0.684    2490      2550   3460
## 2     2    0.180  0.334          0.162    1450      1410   2530
## 3     3    0.278 -0.227          0.102     968       869   2020
## 4     4    0.808 -0.494          0.754     702       574   1690
## 5     5    0.906 -0.645          0.883     536       399   1450
## 6     6    0.916 -0.737          0.896     424       286   1280
sizeGrWindow(9, 5)
par(mfrow = c(1, 2))
cex1 = 0.9

plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3]) * sft$fitIndices[,2],
     xlab = "Soft Threshold (power)", 
     ylab = "Scale Free Topology Model Fit, signed R^2", type = "n",
     main = "Scale independence")
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3]) * sft$fitIndices[,2], 
     labels = powers, cex = cex1, col = "red")
abline(h = 0.80, col = "red")  # Ajuste del umbral a 0.80

plot(sft$fitIndices[,1], sft$fitIndices[,5],
     xlab = "Soft Threshold (power)", 
     ylab = "Mean Connectivity", type = "n",
     main = "Mean connectivity")
text(sft$fitIndices[,1], sft$fitIndices[,5], labels = powers, cex = cex1, col = "red")
net = blockwiseModules(
  datExpr, 
  power = 6,  
  TOMType = "unsigned",  
  minModuleSize = 200,  
  reassignThreshold = 0, 
  mergeCutHeight = 0.20,  
  numericLabels = TRUE, 
  pamRespectsDendro = FALSE,  
  saveTOMs = TRUE, 
  saveTOMFileBase = "TOM_HTG",  
  verbose = 3
)
##  Calculating module eigengenes block-wise from all genes
##    Flagging genes and samples with too many missing values...
##     ..step 1
##  ....pre-clustering genes to determine blocks..
##    Projective K-means:
##    ..k-means clustering..
##    ..merging smaller clusters...
## Block sizes:
## gBlocks
##    1    2    3    4    5 
## 4955 4954 4895 4484 4375 
##  ..Working on block 1 .
##     TOM calculation: adjacency..
##     ..will not use multithreading.
##      Fraction of slow calculations: 0.000000
##     ..connectivity..
##     ..matrix multiplication (system BLAS)..
##     ..normalization..
##     ..done.
##    ..saving TOM for block 1 into file TOM_HTG-block.1.RData
##  ....clustering..
##  ....detecting modules..
##  ....calculating module eigengenes..
##  ....checking kME in modules..
##      ..removing 51 genes from module 1 because their KME is too low.
##      ..removing 26 genes from module 2 because their KME is too low.
##      ..removing 1 genes from module 3 because their KME is too low.
##      ..removing 5 genes from module 4 because their KME is too low.
##      ..removing 3 genes from module 5 because their KME is too low.
##      ..removing 2 genes from module 6 because their KME is too low.
##  ..Working on block 2 .
##     TOM calculation: adjacency..
##     ..will not use multithreading.
##      Fraction of slow calculations: 0.000000
##     ..connectivity..
##     ..matrix multiplication (system BLAS)..
##     ..normalization..
##     ..done.
##    ..saving TOM for block 2 into file TOM_HTG-block.2.RData
##  ....clustering..
##  ....detecting modules..
##  ....calculating module eigengenes..
##  ....checking kME in modules..
##  ..Working on block 3 .
##     TOM calculation: adjacency..
##     ..will not use multithreading.
##      Fraction of slow calculations: 0.000000
##     ..connectivity..
##     ..matrix multiplication (system BLAS)..
##     ..normalization..
##     ..done.
##    ..saving TOM for block 3 into file TOM_HTG-block.3.RData
##  ....clustering..
##  ....detecting modules..
##  ....calculating module eigengenes..
##  ....checking kME in modules..
##  ..Working on block 4 .
##     TOM calculation: adjacency..
##     ..will not use multithreading.
##      Fraction of slow calculations: 0.000000
##     ..connectivity..
##     ..matrix multiplication (system BLAS)..
##     ..normalization..
##     ..done.
##    ..saving TOM for block 4 into file TOM_HTG-block.4.RData
##  ....clustering..
##  ....detecting modules..
##  ....calculating module eigengenes..
##  ....checking kME in modules..
##      ..removing 27 genes from module 1 because their KME is too low.
##      ..removing 9 genes from module 2 because their KME is too low.
##      ..removing 7 genes from module 3 because their KME is too low.
##      ..removing 1 genes from module 4 because their KME is too low.
##      ..removing 1 genes from module 6 because their KME is too low.
##      ..removing 2 genes from module 7 because their KME is too low.
##      ..removing 1 genes from module 8 because their KME is too low.
##      ..removing 1 genes from module 10 because their KME is too low.
##  ..Working on block 5 .
##     TOM calculation: adjacency..
##     ..will not use multithreading.
##      Fraction of slow calculations: 0.000000
##     ..connectivity..
##     ..matrix multiplication (system BLAS)..
##     ..normalization..
##     ..done.
##    ..saving TOM for block 5 into file TOM_HTG-block.5.RData
##  ....clustering..
##  ....detecting modules..
##  ....calculating module eigengenes..
##  ....checking kME in modules..
##      ..removing 7 genes from module 1 because their KME is too low.
##      ..removing 1 genes from module 2 because their KME is too low.
##      ..removing 6 genes from module 3 because their KME is too low.
##      ..removing 4 genes from module 7 because their KME is too low.
##      ..removing 7 genes from module 8 because their KME is too low.
##      ..removing 1 genes from module 9 because their KME is too low.
##  ..merging modules that are too close..
##      mergeCloseModules: Merging modules whose distance is less than 0.2
##        Calculating new MEs...
sizeGrWindow(12, 9)

mergedColors = labels2colors(net$colors)

for (i in 1:length(net$dendrograms)) {
  plotDendroAndColors(
    net$dendrograms[[i]],            
    mergedColors[net$blockGenes[[i]]],  
    "Module colors",                  
    dendroLabels = FALSE, 
    hang = 0.03,                      
    addGuide = TRUE, guideHang = 0.05 
  )
}
table(net$colors)  # Conteo del número de genes por módulo
## 
##    0    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15 
##  163 7108 3309 2540  879  657  656  641  559  497  486  435  418  409  391  386 
##   16   17   18   19   20   21   22   23   24   25   26   27   28   29 
##  378  377  372  358  337  333  287  283  276  254  238  220  214  202
MEs = net$MEs  # # Module Eigengene

moduleTraitCor = cor(MEs, datTraits_clean, use = "pairwise.complete.obs")

moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nrow(datExpr))

print(moduleTraitCor)  
##      geo_accession     Disease          Age Gender
## ME8  -0.4382755148 -0.36457149 -0.104383861     NA
## ME9  -0.2352229168 -0.35510138  0.023690670     NA
## ME17 -0.5689647285 -0.37809909 -0.085707826     NA
## ME16 -0.4514915787 -0.09074511  0.543004884     NA
## ME20 -0.4970108215 -0.34308831  0.617882022     NA
## ME23 -0.8230491909 -0.65341026 -0.296925655     NA
## ME4  -0.5925959756 -0.90279162  0.006531993     NA
## ME6  -0.6305260798 -0.76633530  0.271146142     NA
## ME1  -0.8489723840 -0.82311878  0.280952178     NA
## ME5  -0.7865146094 -0.78344549  0.160073791     NA
## ME15 -0.1956299774  0.13870565 -0.208432057     NA
## ME11 -0.2483511781  0.21374683  0.301846403     NA
## ME25 -0.3241506570  0.04194377  0.408948816     NA
## ME28 -0.0612228444 -0.10532073  0.690944927     NA
## ME14 -0.5613228524 -0.37390689  0.314084857     NA
## ME2  -0.5543003743 -0.59802441  0.090798838     NA
## ME13 -0.4054150893 -0.20857445  0.398355252     NA
## ME21  0.1328138014  0.12629798  0.264375542     NA
## ME10 -0.2560984761 -0.34864712  0.571461193     NA
## ME27 -0.6499293996 -0.38002505  0.449144117     NA
## ME18  0.1866279473 -0.19554363  0.165176368     NA
## ME19 -0.6866338701 -0.72554878 -0.370975140     NA
## ME24 -0.1469863083 -0.50120779 -0.341675325     NA
## ME26 -0.3313068214 -0.54694617 -0.236598033     NA
## ME7   0.0001186404 -0.30538707 -0.522576307     NA
## ME12 -0.1172239434 -0.54019252 -0.281214220     NA
## ME22 -0.6211964781 -0.77746667 -0.490912573     NA
## ME3   0.7326304467  0.92741661 -0.040203096     NA
## ME29  0.1857184646  0.09076155 -0.231270352     NA
## ME0  -0.1400723272  0.02347029  0.061031718     NA
print(moduleTraitPvalue)  
##      geo_accession      Disease        Age Gender
## ME8    0.238005950 0.3347214098 0.78927234     NA
## ME9    0.542356011 0.3483617023 0.95176033     NA
## ME17   0.109863685 0.3157041563 0.82646276     NA
## ME16   0.222493890 0.8163994593 0.13084682     NA
## ME20   0.173451876 0.3660455168 0.07618251     NA
## ME23   0.006424560 0.0563291706 0.43780534     NA
## ME4    0.092651349 0.0008564954 0.98669361     NA
## ME6    0.068684953 0.0160177789 0.48035809     NA
## ME1    0.003790369 0.0064161850 0.46397266     NA
## ME5    0.011928080 0.0124986811 0.68078352     NA
## ME15   0.613958266 0.7219223841 0.59045916     NA
## ME11   0.519351148 0.5807981432 0.42987846     NA
## ME25   0.394767807 0.9146780205 0.27444163     NA
## ME28   0.875666762 0.7874157930 0.03930337     NA
## ME14   0.115813221 0.3215382858 0.41044472     NA
## ME2    0.121447371 0.0889477338 0.81629225     NA
## ME13   0.279017166 0.5901995794 0.28827619     NA
## ME21   0.733379651 0.7461034497 0.49181004     NA
## ME10   0.505960268 0.3578102041 0.10796092     NA
## ME27   0.058109517 0.3130418557 0.22520761     NA
## ME18   0.630667594 0.6141178192 0.67106239     NA
## ME19   0.041062079 0.0269294104 0.32565004     NA
## ME24   0.705900799 0.1692744734 0.36815314     NA
## ME26   0.383794391 0.1275197659 0.53992830     NA
## ME7    0.999758308 0.4242146378 0.14890875     NA
## ME12   0.763911087 0.1332519831 0.46353811     NA
## ME22   0.074170193 0.0136622913 0.17962553     NA
## ME3    0.024762720 0.0003158023 0.91820916     NA
## ME29   0.632363980 0.8163666686 0.54935720     NA
## ME0    0.719271484 0.9522086510 0.87605250     NA
# Extract the correlations and p-values for the trait "Disease"
cor_Disease = moduleTraitCor[, "Disease"]
pval_Disease = moduleTraitPvalue[, "Disease"]

best_module = which.max(abs(cor_Disease))

print(paste("Most significant module:", names(cor_Disease)[best_module]))
## [1] "Most significant module: ME3"
print(paste("Correlation:", cor_Disease[best_module]))
## [1] "Correlation: 0.927416610406747"
print(paste("P-value:", pval_Disease[best_module]))
## [1] "P-value: 0.000315802340459394"