Dataset: “expression data.csv”

In our first line of code we set our working directory. For security purposes, we choose to hide that portion of code. Next, we import a couple of libraries we shall need to first analyze the data.

library(dynamicTreeCut)
library(flashClust) 
library(WGCNA) 

Now, let’s load our dataset and rearrange it to the desired format.

stroke=read.table("expression data.csv",sep= ",",header=T,as.is=T) 
gene.class=stroke[,2] 
stroke=stroke[,-(1:3)] 
Data=as.data.frame(t(stroke[,-1])) ## this is the data for subsequent analysis
names(Data)=gene.class#stroke[,1]

We now identify outlier microarray samples and plot the results.

sampleTree = flashClust(dist(log(t(Data[,1:1000]))), method = "average") # “average” can be replaced with “ward” 
sizeGrWindow(12,9)  # need to call “WGCNA” to use this function 
par(cex = 0.6); 
par(mar = c(0,4,2,0))
plot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="", cex.lab = 1.5, cex.axis = 1.5, cex.main = 2) 

As we can see, the plot above is overcrowded. The network visualization later will provide better details.

Network Topology Analysis

Computing metrics

# Choose a set of soft-thresholding powers 
powers = c(c(1:10), seq(from = 12, to=20, by=2)) 

# Call the network topology analysis function 
sft = pickSoftThreshold(log(Data[,1:1000]), powerVector = powers, verbose = 5,blockSize=8504) 
##  pickSoftThreshold: calculating connectivity for given powers...
##    ..working on genes 1 through 1000 of 1000
##    Power SFT.R.sq slope truncated.R.sq  mean.k. median.k.  max.k.
## 1      1   0.0584 -1.61          0.762 189.0000  1.85e+02 249.000
## 2      2   0.4560 -2.45          0.753  55.4000  5.20e+01  96.700
## 3      3   0.7680 -2.55          0.846  20.4000  1.81e+01  47.600
## 4      4   0.9100 -2.42          0.931   8.7000  7.17e+00  27.000
## 5      5   0.9570 -2.28          0.965   4.1600  3.16e+00  16.700
## 6      6   0.9650 -2.13          0.967   2.1600  1.50e+00  11.100
## 7      7   0.9810 -1.98          0.981   1.2100  7.60e-01   7.720
## 8      8   0.9640 -1.88          0.960   0.7130  4.03e-01   5.540
## 9      9   0.9710 -1.80          0.970   0.4410  2.24e-01   4.070
## 10    10   0.9770 -1.69          0.976   0.2850  1.28e-01   3.040
## 11    12   0.9690 -1.63          0.976   0.1300  4.57e-02   1.920
## 12    14   0.9420 -1.60          0.957   0.0663  1.72e-02   1.320
## 13    16   0.3240 -3.16          0.297   0.0365  6.90e-03   0.968
## 14    18   0.3330 -2.96          0.313   0.0215  2.81e-03   0.741
## 15    20   0.3620 -3.35          0.305   0.0133  1.22e-03   0.588
# Plot the results: 
sizeGrWindow(9, 5) 
par(mfrow = c(1,2))
cex1 = 0.9
# Scale-free topology fit index R2 as a function of the soft-thresholding power 
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2], xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n", main = paste("Scale independence")); 
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2], labels=powers,cex=cex1,col="red")

# this line corresponds to using an R^2 cut-off of h 
abline(h=0.90,col="red") 

# Mean connectivity as a function of the soft-thresholding power 
plot(sft$fitIndices[-(1:2),1], sft$fitIndices[-(1:2),5], xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n", main = paste("Mean connectivity")) 
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")

Computing Network adjacency and topological overlap matrices.

## calculate the adjacencies, using the soft thresholding power 5 
softPower = 5
adjacency = adjacency(log(Data[,1:1000]), power = softPower)


# Turn adjacency into topological overlap matrix 
TOM = TOMsimilarity(adjacency); 
## ..connectivity..
## ..matrix multiplication..
## ..normalization..
## ..done.
dissTOM = 1-TOM 

## Clustering using TOM 
# Call the hierarchical clustering function 
geneTree = flashClust(as.dist(dissTOM), method = "average")

# Plot the resulting clustering tree (dendrogram) 
sizeGrWindow(12,9) 
plot(geneTree, xlab="", sub="", main = "Gene clustering on TOM-based dissimilarity", labels = FALSE, hang = 0.04)

Module detection

# We like large modules, so we set the minimum module size relatively high: 
#minModuleSize = 30
minModuleSize = 10

# Module identification using dynamic tree cut: 
dynamicMods = cutreeDynamic(dendro = geneTree, distM = dissTOM, deepSplit = 2, pamRespectsDendro = FALSE, minClusterSize = minModuleSize)
##  ..cutHeight not given, setting it to 0.996  ===>  99% of the (truncated) height range in dendro.
##  ..done.
table(dynamicMods) 
## dynamicMods
##   0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17 
##  24 194 135 121  88  61  48  44  42  40  31  30  27  26  25  24  20  20
#Label 0 is reserved for unassigned genes 

### plot the module assignment under the gene dendrogram 
# Convert numeric lables into colors 
dynamicColors = labels2colors(dynamicMods) 
table(dynamicColors) 
## dynamicColors
##        black         blue        brown         cyan        green 
##           44          135          121           25           61 
##  greenyellow         grey       grey60    lightcyan      magenta 
##           30           24           20           20           40 
## midnightblue         pink       purple          red       salmon 
##           24           42           31           48           26 
##          tan    turquoise       yellow 
##           27          194           88
# Plot the dendrogram and colors underneath 
sizeGrWindow(8,6) 
plotDendroAndColors(geneTree, dynamicColors, "Dynamic Tree Cut", dendroLabels = FALSE,
                    hang = 0.03, addGuide = TRUE, guideHang = 0.05, 
                    main = "Gene dendrogram and module colors")

We shall later use the dynamicColor variable to color each module in the network.

### Merging of modules whose expression profiles are very similar 
## To quantify co-expression similarity of entire modules, we calculate their eigengenes and cluster them on their correlation: 

# Calculate eigengenes 
MEList = moduleEigengenes(log(Data[,1:1000]), colors = dynamicColors) 
MEs = MEList$eigengenes 

# Calculate dissimilarity of module eigengenes 
MEDiss = 1-cor(MEs);  ### Is this correct?? Should use absolute correlation ????  YES ### 

# Cluster module eigengenes 
METree = flashClust(as.dist(MEDiss), method = "average"); 

# Plot the result 
sizeGrWindow(7, 6) 
plot(METree, main = "Clustering of module eigengenes", xlab = "", sub = "") 
## merge: choose a height cut of 0.2, corresponding to correlation of 0.8, to merge 
MEDissThres = 0.2 

# Plot the cut line into the dendrogram 
abline(h=MEDissThres, col = "red") 

# Call an automatic merging function 
merge = mergeCloseModules(log(Data[,1:1000]), dynamicColors, cutHeight = MEDissThres, verbose = 3) 
##  mergeCloseModules: Merging modules whose distance is less than 0.2
##    multiSetMEs: Calculating module MEs.
##      Working on set 1 ...
##      moduleEigengenes: Calculating 18 module eigengenes in given set.
##    Calculating new MEs...
##    multiSetMEs: Calculating module MEs.
##      Working on set 1 ...
##      moduleEigengenes: Calculating 18 module eigengenes in given set.
# The merged module colors 
mergedColors = merge$colors; 

# Eigengenes of the new merged modules: 
mergedMEs = merge$newMEs; 

Network Visualization

We are using the adjacency matrix to visualize the network

# genes network
library('igraph')
library('network')
library('networkD3')
library('intergraph')
m<-adjacency
# With networkD3 package
source=c()
target=c()
corr<-c()
g1<-gene.class[1:dim(m)[1]]
g2<-g1

for(gene in g1){
  for(gen in g2){
    if(m[gene,gen]<0.9999 & m[gene,gen]>0.3){
      source<-c(source,gene)
      target<-c(target,gen)
      corr<-c(corr,m[gene,gen])
    }
  }
}
NetworkData <- data.frame(source, target)

Now, let’s visualize the network with the NetworkD3 package.

simpleNetwork(NetworkData,zoom = T)

Now, let’s use the igraph package with its tkplot() function.

# With igraph package
network<-data.frame(source,target,corr)
netw <- graph.data.frame(network)

V(netw)$color <- sample( c("red", "white"),vcount(netw), rep=TRUE)
E(netw)$color <- "grey"

red <- V(netw)[ color == "red" ]
bl <- V(netw)[ color == "green" ]
E(netw)[ red %--% red ]$color <- "red"
E(netw)[ bl %--% bl ]$color <- "green"

library(InteractiveIGraph)

# now it is interactive. Please enjoy :)
if(interactive()){
  g = InteractiveIGraph(netw)
}
tkplot(netw, vertex.size=3, vertex.label=V(netw)$name,#label.size=1,
     edge.arrow.size=0.1,
     edge.color="black",
     vertex.color="red", frame=TRUE)
## Loading required package: tcltk
## [1] 1

# More network graph

## nodes information with cluster membership
nodes<-as.data.frame(cbind(gene.class[1:1000],dynamicMods))
rownames(nodes)<-NULL
colnames(nodes) <- c("genes","cluster")
## Ties dataset
ties<-network
ties <- aggregate(ties[,3], ties[,-3], sum)
ties <- ties[order(ties$source, ties$target),]
colnames(ties)[3] <- "weight"
rownames(ties) <- NULL

# After the hard work is done, we can convert the data to an igraph object:
net <- graph.data.frame(ties, directed=F)

# One final touch is to removing loops from the graph, so the edges won’t appear that bushy:
net <- simplify(net, remove.multiple = F, remove.loops = T)

# A clean plot: nodes' colors. Here we set color to orange and the border color to hex #555555
tkplot(net, edge.arrow.size=10, edge.curved=0,
     vertex.color="orange", vertex.frame.color="#555555")
## [1] 2

More network Visualization: In this visualization, the node size is proportionate to its degree and the edge color depends on the strength of the correlation between two related genes. Here, we only emphasize relatively high correlation in darkred and darkorange (with darkred being the relatively hgighest correlation).

# 1) Generate colors base on media type:
nodeIndex<-c()
for(name in V(net)$name){
  nodeIndex<-c(nodeIndex,which(nodes$genes==name))
}
colors<-dynamicColors[nodeIndex]
V(net)$color <- colors

# 2) Compute node degree (#ties) and use it to set node size:
deg <- degree(net, mode="all")
V(net)$size <- deg*2 

# 3) The labels are currently node IDs, setting them to NA will render no labels:
V(net)$label.color <- "black"

# 4) We can set edge width based on weight:
E(net)$width <- E(net)$weight*2

#5) changing arrow size and edge color:
E(net)$arrow.size <- 10
edgeColor<-c()
for(e in E(net)$weight){
  ecol<-(e-mean(E(net)$weight))*5+1
  if(ecol>=2){
    edgeColor<-c(edgeColor,'darkred')
  } else if(ecol>=1.10 & ecol<2){
    edgeColor<-c(edgeColor,'darkorange')
  }
  else{
    edgeColor<-c(edgeColor,'gray80')
  }
}
E(net)$edge.color <- "gray80"


# Edge width
w<-(E(net)$weight-mean(E(net)$weight))*5+1
tkplot(net,edge.arrow.size=1,edge.curved=0,edge.width=w,edge.color=edgeColor)
## [1] 3