Dataset: “expression data.csv”
In our first line of code we set our working directory. For security purposes, we choose to hide that portion of code. Next, we import a couple of libraries we shall need to first analyze the data.
library(dynamicTreeCut)
library(flashClust)
library(WGCNA)
Now, let’s load our dataset and rearrange it to the desired format.
stroke=read.table("expression data.csv",sep= ",",header=T,as.is=T)
gene.class=stroke[,2]
stroke=stroke[,-(1:3)]
Data=as.data.frame(t(stroke[,-1])) ## this is the data for subsequent analysis
names(Data)=gene.class#stroke[,1]
We now identify outlier microarray samples and plot the results.
sampleTree = flashClust(dist(log(t(Data[,1:1000]))), method = "average") # “average” can be replaced with “ward”
sizeGrWindow(12,9) # need to call “WGCNA” to use this function
par(cex = 0.6);
par(mar = c(0,4,2,0))
plot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="", cex.lab = 1.5, cex.axis = 1.5, cex.main = 2)
As we can see, the plot above is overcrowded. The network visualization later will provide better details.
Computing metrics
# Choose a set of soft-thresholding powers
powers = c(c(1:10), seq(from = 12, to=20, by=2))
# Call the network topology analysis function
sft = pickSoftThreshold(log(Data[,1:1000]), powerVector = powers, verbose = 5,blockSize=8504)
## pickSoftThreshold: calculating connectivity for given powers...
## ..working on genes 1 through 1000 of 1000
## Power SFT.R.sq slope truncated.R.sq mean.k. median.k. max.k.
## 1 1 0.0584 -1.61 0.762 189.0000 1.85e+02 249.000
## 2 2 0.4560 -2.45 0.753 55.4000 5.20e+01 96.700
## 3 3 0.7680 -2.55 0.846 20.4000 1.81e+01 47.600
## 4 4 0.9100 -2.42 0.931 8.7000 7.17e+00 27.000
## 5 5 0.9570 -2.28 0.965 4.1600 3.16e+00 16.700
## 6 6 0.9650 -2.13 0.967 2.1600 1.50e+00 11.100
## 7 7 0.9810 -1.98 0.981 1.2100 7.60e-01 7.720
## 8 8 0.9640 -1.88 0.960 0.7130 4.03e-01 5.540
## 9 9 0.9710 -1.80 0.970 0.4410 2.24e-01 4.070
## 10 10 0.9770 -1.69 0.976 0.2850 1.28e-01 3.040
## 11 12 0.9690 -1.63 0.976 0.1300 4.57e-02 1.920
## 12 14 0.9420 -1.60 0.957 0.0663 1.72e-02 1.320
## 13 16 0.3240 -3.16 0.297 0.0365 6.90e-03 0.968
## 14 18 0.3330 -2.96 0.313 0.0215 2.81e-03 0.741
## 15 20 0.3620 -3.35 0.305 0.0133 1.22e-03 0.588
# Plot the results:
sizeGrWindow(9, 5)
par(mfrow = c(1,2))
cex1 = 0.9
# Scale-free topology fit index R2 as a function of the soft-thresholding power
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2], xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n", main = paste("Scale independence"));
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2], labels=powers,cex=cex1,col="red")
# this line corresponds to using an R^2 cut-off of h
abline(h=0.90,col="red")
# Mean connectivity as a function of the soft-thresholding power
plot(sft$fitIndices[-(1:2),1], sft$fitIndices[-(1:2),5], xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n", main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")
Computing Network adjacency and topological overlap matrices.
## calculate the adjacencies, using the soft thresholding power 5
softPower = 5
adjacency = adjacency(log(Data[,1:1000]), power = softPower)
# Turn adjacency into topological overlap matrix
TOM = TOMsimilarity(adjacency);
## ..connectivity..
## ..matrix multiplication..
## ..normalization..
## ..done.
dissTOM = 1-TOM
## Clustering using TOM
# Call the hierarchical clustering function
geneTree = flashClust(as.dist(dissTOM), method = "average")
# Plot the resulting clustering tree (dendrogram)
sizeGrWindow(12,9)
plot(geneTree, xlab="", sub="", main = "Gene clustering on TOM-based dissimilarity", labels = FALSE, hang = 0.04)
# We like large modules, so we set the minimum module size relatively high:
#minModuleSize = 30
minModuleSize = 10
# Module identification using dynamic tree cut:
dynamicMods = cutreeDynamic(dendro = geneTree, distM = dissTOM, deepSplit = 2, pamRespectsDendro = FALSE, minClusterSize = minModuleSize)
## ..cutHeight not given, setting it to 0.996 ===> 99% of the (truncated) height range in dendro.
## ..done.
table(dynamicMods)
## dynamicMods
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
## 24 194 135 121 88 61 48 44 42 40 31 30 27 26 25 24 20 20
#Label 0 is reserved for unassigned genes
### plot the module assignment under the gene dendrogram
# Convert numeric lables into colors
dynamicColors = labels2colors(dynamicMods)
table(dynamicColors)
## dynamicColors
## black blue brown cyan green
## 44 135 121 25 61
## greenyellow grey grey60 lightcyan magenta
## 30 24 20 20 40
## midnightblue pink purple red salmon
## 24 42 31 48 26
## tan turquoise yellow
## 27 194 88
# Plot the dendrogram and colors underneath
sizeGrWindow(8,6)
plotDendroAndColors(geneTree, dynamicColors, "Dynamic Tree Cut", dendroLabels = FALSE,
hang = 0.03, addGuide = TRUE, guideHang = 0.05,
main = "Gene dendrogram and module colors")
We shall later use the dynamicColor variable to color each module in the network.
### Merging of modules whose expression profiles are very similar
## To quantify co-expression similarity of entire modules, we calculate their eigengenes and cluster them on their correlation:
# Calculate eigengenes
MEList = moduleEigengenes(log(Data[,1:1000]), colors = dynamicColors)
MEs = MEList$eigengenes
# Calculate dissimilarity of module eigengenes
MEDiss = 1-cor(MEs); ### Is this correct?? Should use absolute correlation ???? YES ###
# Cluster module eigengenes
METree = flashClust(as.dist(MEDiss), method = "average");
# Plot the result
sizeGrWindow(7, 6)
plot(METree, main = "Clustering of module eigengenes", xlab = "", sub = "")
## merge: choose a height cut of 0.2, corresponding to correlation of 0.8, to merge
MEDissThres = 0.2
# Plot the cut line into the dendrogram
abline(h=MEDissThres, col = "red")
# Call an automatic merging function
merge = mergeCloseModules(log(Data[,1:1000]), dynamicColors, cutHeight = MEDissThres, verbose = 3)
## mergeCloseModules: Merging modules whose distance is less than 0.2
## multiSetMEs: Calculating module MEs.
## Working on set 1 ...
## moduleEigengenes: Calculating 18 module eigengenes in given set.
## Calculating new MEs...
## multiSetMEs: Calculating module MEs.
## Working on set 1 ...
## moduleEigengenes: Calculating 18 module eigengenes in given set.
# The merged module colors
mergedColors = merge$colors;
# Eigengenes of the new merged modules:
mergedMEs = merge$newMEs;
We are using the adjacency matrix to visualize the network
# genes network
library('igraph')
library('network')
library('networkD3')
library('intergraph')
m<-adjacency
# With networkD3 package
source=c()
target=c()
corr<-c()
g1<-gene.class[1:dim(m)[1]]
g2<-g1
for(gene in g1){
for(gen in g2){
if(m[gene,gen]<0.9999 & m[gene,gen]>0.3){
source<-c(source,gene)
target<-c(target,gen)
corr<-c(corr,m[gene,gen])
}
}
}
NetworkData <- data.frame(source, target)
Now, let’s visualize the network with the NetworkD3 package.
simpleNetwork(NetworkData,zoom = T)
Now, let’s use the igraph package with its tkplot() function.
# With igraph package
network<-data.frame(source,target,corr)
netw <- graph.data.frame(network)
V(netw)$color <- sample( c("red", "white"),vcount(netw), rep=TRUE)
E(netw)$color <- "grey"
red <- V(netw)[ color == "red" ]
bl <- V(netw)[ color == "green" ]
E(netw)[ red %--% red ]$color <- "red"
E(netw)[ bl %--% bl ]$color <- "green"
library(InteractiveIGraph)
# now it is interactive. Please enjoy :)
if(interactive()){
g = InteractiveIGraph(netw)
}
tkplot(netw, vertex.size=3, vertex.label=V(netw)$name,#label.size=1,
edge.arrow.size=0.1,
edge.color="black",
vertex.color="red", frame=TRUE)
## Loading required package: tcltk
## [1] 1
# More network graph
## nodes information with cluster membership
nodes<-as.data.frame(cbind(gene.class[1:1000],dynamicMods))
rownames(nodes)<-NULL
colnames(nodes) <- c("genes","cluster")
## Ties dataset
ties<-network
ties <- aggregate(ties[,3], ties[,-3], sum)
ties <- ties[order(ties$source, ties$target),]
colnames(ties)[3] <- "weight"
rownames(ties) <- NULL
# After the hard work is done, we can convert the data to an igraph object:
net <- graph.data.frame(ties, directed=F)
# One final touch is to removing loops from the graph, so the edges won’t appear that bushy:
net <- simplify(net, remove.multiple = F, remove.loops = T)
# A clean plot: nodes' colors. Here we set color to orange and the border color to hex #555555
tkplot(net, edge.arrow.size=10, edge.curved=0,
vertex.color="orange", vertex.frame.color="#555555")
## [1] 2
More network Visualization: In this visualization, the node size is proportionate to its degree and the edge color depends on the strength of the correlation between two related genes. Here, we only emphasize relatively high correlation in darkred and darkorange (with darkred being the relatively hgighest correlation).
# 1) Generate colors base on media type:
nodeIndex<-c()
for(name in V(net)$name){
nodeIndex<-c(nodeIndex,which(nodes$genes==name))
}
colors<-dynamicColors[nodeIndex]
V(net)$color <- colors
# 2) Compute node degree (#ties) and use it to set node size:
deg <- degree(net, mode="all")
V(net)$size <- deg*2
# 3) The labels are currently node IDs, setting them to NA will render no labels:
V(net)$label.color <- "black"
# 4) We can set edge width based on weight:
E(net)$width <- E(net)$weight*2
#5) changing arrow size and edge color:
E(net)$arrow.size <- 10
edgeColor<-c()
for(e in E(net)$weight){
ecol<-(e-mean(E(net)$weight))*5+1
if(ecol>=2){
edgeColor<-c(edgeColor,'darkred')
} else if(ecol>=1.10 & ecol<2){
edgeColor<-c(edgeColor,'darkorange')
}
else{
edgeColor<-c(edgeColor,'gray80')
}
}
E(net)$edge.color <- "gray80"
# Edge width
w<-(E(net)$weight-mean(E(net)$weight))*5+1
tkplot(net,edge.arrow.size=1,edge.curved=0,edge.width=w,edge.color=edgeColor)
## [1] 3