rm(list=ls())
library(Rtsne)
library(ggplot2)
library(RColorBrewer)
library(paletteer)
# theme
top.mar=0.1
right.mar=0.1
bottom.mar=0.1
left.mar=0.1
mytheme <- theme_classic() +
theme(plot.title = element_text(hjust = 0.5,size = 18, face = 'bold'),
axis.text = element_text(size = 14),
axis.title = element_text(size = 16, face = 'bold'),
axis.text.x = element_text(size = 14, vjust = 0.5), # vjust = -0.001
legend.text = element_text(size = 12),
legend.title = element_text(size = 16),
legend.position = 'none',
plot.margin=unit(x=c(top.mar,right.mar,bottom.mar,left.mar),
units="inches"))
mytheme1 <- theme_classic() +
theme(plot.title = element_text(hjust = 0.5,size = 18, face = 'bold'),
axis.text = element_text(size = 10),
axis.title = element_text(size = 14, face = 'bold'),
axis.text.x = element_text(size = 10, vjust = 0.5), # vjust = -0.001
legend.text = element_text(size = 12),
# legend.title = element_blank(),
legend.title = element_text(size = 16),
legend.position = 'right',
plot.margin=unit(x=c(top.mar,right.mar,bottom.mar,left.mar),
units="inches"))
# pallete
Pale_51 <- as.vector(paletteer_d('ggsci::default_igv'))
pale_9_1 <- as.vector(paletteer_d('ggprism::pastels'))
pale_20_2 <- as.vector(paletteer_d('ggthemes::Tableau_20'))
pale_11_2 <- as.vector(paletteer_d('khroma::sunset'))
pale_12_2 <- as.vector(paletteer_d('RColorBrewer::Set3'))
pale_8_1 <- as.vector(paletteer_d('RColorBrewer::Pastel2'))
pale_29 <- c(pale_20_2, pale_9_1)
pale_28 <- c(pale_8_1, pale_11_2, pale_9_1)
pale_31 <- c(pale_8_1, pale_11_2, pale_12_2)
pale_32 <- c(pale_12_2, pale_20_2)
pale_25 <- c(pale_12_2, pale_11_2, '#85A4FE', 'grey')
pale_29 <- c(pale_25, pale_8_1)
pale_10 <- as.vector(paletteer_d('ggsci::default_jco'))
pale_51 <- as.vector(paletteer_d('ggsci::default_igv'))
cluster_pale <- c('#8DD3C7FF','#BEBADAFF',"#FB8072FF", "#FDB462FF")
Basic variables:
label_path,
output_path
graphdata.path
fig.path
kegg_anno.path
Sub_Swiss.path,
graphdata,
labels,
kegg_anno,
Sub_Swiss,
matched_swiss,
matched_kegg
basic_var <- function(dataset, group, SubDataset){
# --- define path ---
label_path <- paste("./data/", dataset, '/output_data/', SubDataset, '/generated_data/types.txt', sep = '')
output_path <- paste("./data/", dataset, "/inputdata/", sep = '')
graphdata.path <- paste("./data/", dataset, "/inputdata/graphdata_", group, '.csv', sep = '')
fig.path <- paste("./data/", dataset, "/output_data/Figures", sep = '')
graphdata.labeled.path <- paste("./data/", dataset, "/inputdata/graphdata_", group, 'labeled.csv', sep = '')
# annotation
kegg_anno.path <- paste("./data/", dataset, "/rawdata/kegg_annotation_all.csv", sep = '')
Sub_Swiss.path <- paste("./data/", dataset, "/inputdata/SubSwissExpre.csv", sep = '')
# --- load data ---
graphdata <- read.csv(graphdata.path, header = TRUE)
labels <- paste('Cluster',as.vector(apply(read.table(label_path, sep='\t', header = F)+1, 2, as.factor)))
graphdata$Cluster <- labels
# save labeled data
if (file.exists(graphdata.labeled.path)){
print(paste("File", graphdata.labeled.path, "exists."))
} else {
write.csv(graphdata, graphdata.labeled.path, row.names = FALSE)
}
# load kegg annotation
if (file.exists(kegg_anno.path)){
kegg_anno <- read.csv(kegg_anno.path)
matched_kegg <- merge(graphdata[,-(length(graphdata)-2)], kegg_anno, by = 'id')
Sub_Swiss = NA
matched_swiss = NA
} else {
print('Skipped: lack of kegg annotation data.')
}
if (file.exists(Sub_Swiss.path)){
Sub_Swiss <- read.csv(Sub_Swiss.path)
matched_swiss <- merge(graphdata[-c((length(graphdata)-4):(length(graphdata)-1))], Sub_Swiss, by = 'id')
matched_kegg <- merge(matched_swiss, kegg_anno, by = 'id')
}
# return var list
var_list <- list(label_path, output_path, graphdata.path, fig.path, kegg_anno.path,
Sub_Swiss.path,
graphdata, labels,
kegg_anno, Sub_Swiss,
matched_swiss, matched_kegg)
return(var_list)
}
Clustering <- function(dataset, group, SubDataset){
var_list <- basic_var(dataset, group, SubDataset)
# # --- define path ---
# label_path <- var_list[[1]]
# output_path <- var_list[[2]]
# graphdata.path <- var_list[[3]]
# fig.path <- var_list[[4]]
# --- load data ---
graphdata <- var_list[[7]]
labels <- var_list[[8]]
# --- visualization ---
library(tidyverse)
library(cowplot)
library(ggplot2)
p <- ggplot(graphdata, aes(tSNE1,tSNE2, colour = Cluster)) +
geom_point(size=2, alpha = 0.8) +
# stat_ellipse(mapping=aes(group = Cluster,colour = Cluster), #分组边界圈
# geom = "path", # other: polygon
# linetype = 2,
# size=0.6,
# alpha=0.5) +
xlab(NULL) +
ylab(NULL) +
theme_cowplot() +
theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
legend.title = element_blank(),
panel.border = element_blank(),
axis.line.x = element_line(color = "black", size = 0.5),
axis.line.y = element_line(color = "black", size = 0.5),
panel.background = element_blank()
) +
scale_colour_manual(values = pale_25)
fig.path <- paste("./data/", dataset, "/output_data/Figures", sep = '')
ggsave(filename = paste('ClusteringLabeled_', group,'.pdf', sep = ''), width = 6, height = 5, units = 'in', path = fig.path)
return(p)
}
Annotation <- function(dataset, group, SubDataset,
GeneSet_bykegg=0,
GeneSet_byswiss=0,
GeneSet_bycluster=0){
"
return: only save data in output path
"
# load data
var_list <- basic_var(dataset, group, SubDataset)
matched_kegg <- var_list[[length(var_list)]]
# extract and save gene set
if (GeneSet_bycluster[[1]] != 0){
# --- extract functional gene set by cluster ---
# -- cluster 1 --
for (i in 1:length(GeneSet_bycluster)){
Cluster <- filter(matched_kegg, matched_kegg$Cluster == GeneSet_bycluster[[i]])
### save data
print(paste('The total gene number of', GeneSet_bycluster[[i]], 'is:', length(unique(Cluster$id))) ) # 26 genes in total
output_path <- paste("./data/", dataset, "/output_data/R/", SubDataset, '_', GeneSet_bycluster[[i]], '.csv', sep = '')
write.csv(Cluster, output_path, row.names = FALSE)
}
} else if (GeneSet_byswiss[[1]] != 0){
# --- extract functional gene set by gene name ---
for (i in 1:length(GeneSet_byswiss)){
for (j in 1:length(GeneSet_byswiss[[i]])){
result.temp <- matched_kegg[grepl(GeneSet_byswiss[[i]][j], matched_kegg$SwissProt_Description), ]
if (j == 1){
result <- result.temp
} else{
result <- rbind(result, result.temp)
}
}
print(paste('The total gene number of', GeneSet_byswiss[[i]], 'is:', length(unique(result$id))) )
output_path <- paste("./data/", dataset, "/output_data/R/", SubDataset, '_', GeneSet_byswiss[[i]][j], '.csv', sep = '')
write.csv(result, output_path, row.names = FALSE)
}
} else if (GeneSet_bykegg[[1]] != 0){
# --- extract functional gene set by gene name ---
for (i in 1:length(GeneSet_bykegg)){
for (j in 1:length(GeneSet_bykegg[[i]])){
result.temp <- matched_kegg[grepl(GeneSet_bykegg[[i]][j], matched_kegg$level3_pathway_name), ]
if (j == 1){
result <- result.temp
} else{
result <- rbind(result, result.temp)
}
}
print(paste('The total gene number of', GeneSet_bykegg[[i]], 'is:', length(unique(result$id))) ) # 26 genes in total
output_path <- paste("./data/", dataset, "/output_data/R/", SubDataset, '_', GeneSet_bykegg[[i]][j], '.csv', sep = '')
write.csv(result, output_path, row.names = FALSE)
}
}
}
# --- define path ---
obtain_pathdata <- function(dataset, group, SubDataset,
Cluster){
rpkm.path <- paste('./data/', dataset, "/rawdata/RPKM.txt", sep = '')
Cluster.path <- paste('./data/', dataset, "/output_data/R/", SubDataset, '_', Cluster, '.csv', sep = '')
group.path <- paste('./data/', dataset, "/inputdata/group.csv", sep = '')
kegg_anno.path <- paste('./data/', dataset, "/rawdata/kegg_annotation_all.csv", sep = '')
Enrichment.path <- paste('./data/', dataset, '/output_data/R/', SubDataset, '_', Cluster, '_Enrichment.csv', sep = '')
# --- load data ---
Cluster.data <- read.csv(Cluster.path, header = T)
rpkm <- read.table(rpkm.path, sep='\t', header = T)
group <- read.csv(group.path)
# --- combine data ---
rawdata <- merge(rpkm, Cluster.data[,-c(2:length(rpkm))], by = 'id')
samples <- colnames(rawdata)[2:length(rpkm)]
print('The samples and groups are as follows:')
print(samples)
print(group)
return (rawdata)
}
obtain_pvalue <- function(dataset, ref_group, group_sample.data, pathway, annotation.level){
# prepare data
group <- group_sample.data
temp <- data.frame(t(pathway))
colnames(temp) <- temp[1,]
temp <- temp[-1,]
name <- colnames(temp)
library(ggpubr)
temp <- data.frame(apply(temp[,1:(length(temp))], 2, as.numeric))
temp$group <- group$Group
# initial comparison
compare_data <- temp[, c(1, length(temp))]
colnames(compare_data) <- c('Pathway', 'group')
result <- compare_means(Pathway~group, data = compare_data, ref.group = ref_group,
method = 't.test',
paired = FALSE)
p_value <- data.frame(t(result$p))
expre_group.temp <- unique(group$Group)
loc <- match(ref_group, expre_group.temp)
expre_group <- expre_group.temp[-loc]
colnames(p_value) <- expre_group
# loop
for (i in 2:(length(temp)-1)){
compare_data <- temp[, c(i, length(temp))]
colnames(compare_data) <- c('Pathway', 'group')
result <- compare_means(Pathway ~ group, data = compare_data, ref.group = ref_group,
method = 't.test',
paired = FALSE)
p <- data.frame(t(result$p))
colnames(p) <- expre_group
p_value <- rbind(p_value, p)
}
rownames(p_value) <- name
p_value$pathway <- rownames(p_value)
colnames(p_value) <- c(paste('pvalue', colnames(p_value)[1:(length(p_value)-1)], sep = '_'), annotation.level)
df.pathlevel_p <- merge(pathway, p_value, by = annotation.level)
return(df.pathlevel_p)
}
ObtainStatistic <- function(pathway, df.pathlevel_p, group_sample.data){
mean <- apply(pathway[,2:(length(pathway)-1)], 1, mean)
mean_list <- list()
sd_list <- list()
for (i in 1:((length(pathway)-1)/3)){
sd.temp <- as.data.frame(apply(pathway[,(i*3-1):(i*3+1)], 1, sd))
mean.temp <- as.data.frame(apply(pathway[,(i*3-1):(i*3+1)], 1, mean))
sd_list[i] <- sd.temp
mean_list[i] <- mean.temp
}
for (i in 2:((length(pathway)-1)/3)){
df.pathlevel_p[SubDataset] <- mean_list[[i]]/mean_list[[1]]
df.pathlevel_p[unique(group_sample.data$Group)[i]] <- mean_list[[i]]
}
return(df.pathlevel_p)
}
last_chara <- function(string, chara=';'){
result <- sub(paste0(".*", chara), "", string)
return (result)
}
enrich <- function(df.pathlevel_p, SubDataset, dataset, Cluster,
p_thre, FC_up, FC_down, Expre,
group_sample.data){
Path.filtered <- filter(df.pathlevel_p, (df.pathlevel_p[SubDataset] > FC_up | df.pathlevel_p[SubDataset] < FC_down) &
df.pathlevel_p[colnames(df.pathlevel_p[length(df.pathlevel_p)-2])] < p_thre &
df.pathlevel_p[colnames(df.pathlevel_p[length(df.pathlevel_p)])] > Expre)
Enrichment.path <- paste('./data/', dataset, '/output_data/R/', SubDataset, '_', Cluster, '_Enrichment.csv', sep = '')
if (file.exists(Enrichment.path)){
print(paste('File', Enrichment.path, 'Exist.'))
} else {
write.csv(Path.filtered, Enrichment.path, row.names = FALSE)
}
plot_data <- Path.filtered
kegg_anno <- read.csv(paste("./data/", dataset, "/rawdata/kegg_annotation_all.csv", sep = ''))
# match to obtain kegglevel2 labels
plot_data$KEGGBrite2 <- kegg_anno[match(plot_data$level3_pathway_name, kegg_anno$level3_pathway_name),]$level2_pathway_name
# extract the last level (after ';')
plot_data$kegglevel3 <- apply(plot_data['level3_pathway_name'], 2, last_chara)
plot_data$kegglevel2 <- apply(plot_data['KEGGBrite2'], 2, last_chara)
# obtain data for visualization
start_index <- length(group_sample.data$Samples)+2
data <- plot_data[, c(1, start_index:(start_index+5))]
data$Expression <- apply(plot_data[,(start_index-3):(start_index-1)], 1, mean)
colnames(data) <- c('level3_pathway_name', 'pvalue', 'FoldChange', 'NHQ_mean', 'KEGGBrite2', 'kegglevel3', 'kegglevel2', 'Expression')
# visualization
library(ggrepel)
p <- ggplot(data, aes(x = pvalue, y = FoldChange, size = Expression, color = kegglevel2)) +
geom_point(alpha=0.7)
p <- p + geom_text_repel(data = data,
aes(pvalue, y = FoldChange, label = kegglevel3),
size=4,color="grey20",
point.padding = 0.5,hjust = 0.1,
segment.color="grey20",
segment.size=0.6,
segment.alpha=0.8,
nudge_x=0,
nudge_y=0,
max.overlaps = 10
) +
labs(x = 'P-value', y = 'Fold change', title = fig.name)
fig <- p + scale_size(range = c(2, 20), name="Expression") +
scale_color_manual(values = pale_20_2) + #'#006C75', #36CFC8',
mytheme1
fig.path <- paste("./data/", dataset, "/output_data/Figures", sep = '')
ggsave(filename = paste('Enrichment_', SubDataset,'.pdf', sep = ''), width = 10, height = 7.5, units = 'in', path = fig.path)
return(fig)
}
GPsEnrichCompare <- function(dataset, group, SubDataset, k=7){
'
:para k: the number of cluster
'
# labeling gene panels
plotdata_cluster.list <- list()
for (i in 1:k){
Cluster <- paste('Cluster', i)
cluster_path <- paste("./data/", dataset, "/output_data/R/", SubDataset, '_', Cluster, '_Enrichment.csv', sep = '')
plotdata_cluster.list[[i]] <- read.csv(cluster_path)
plotdata_cluster.list[[i]]$Cluster <- Cluster
if (i == 1){
plotdata_cluster <- as.data.frame(plotdata_cluster.list[[1]])
} else {
plotdata_cluster <- rbind(plotdata_cluster, plotdata_cluster.list[[i]])
}
}
# prepare plot data
plotdata_cluster$Expression <- apply(plotdata_cluster[,7:9], 1, mean)
plotdata <- plotdata_cluster[,-c(2:7)]
colnames(plotdata) <- c('level3_pathway_name', 'pvalue', 'FoldChange', group, 'Cluster', 'Expression')
# plot
library(ggrepel)
p <- ggplot(plotdata,
aes(x = pvalue, y = FoldChange, size = Expression, color = Cluster)) +
geom_point(alpha=0.7)
p <- p + geom_text_repel(data = plotdata,
aes(pvalue, y = FoldChange,label = level3_pathway_name),
size=4,color="grey20",
point.padding = 0.5,hjust = 0.1,
segment.color="grey20",
segment.size=0.6,
segment.alpha=0.8,
nudge_x=0,
nudge_y=0,
max.overlaps = 11
) +
labs(x = 'P-value', y = 'Fold change', title = group)
p <- p + scale_size(range = c(1, 15), name="Expression") +
scale_color_manual(values = pale_10) +
mytheme1
# save fig
fig.path <- paste("./data/", dataset, "/output_data/Figures", sep = '')
ggsave(filename = paste('GPsErichComparison_', group,'.pdf', sep = ''), width = 9, height = 8, units = 'in', path = fig.path)
return(p)
}
Search_GePa <- function(var_list, SearchGene=TRUE, SearchPathway=FALSE){
matched_kegg <- var_list[[length(var_list)]]
if (SearchGene == TRUE){
annotation <- 'ko_des'
# grep gene info list
kodes <- matched_kegg[grepl(interested.gene, matched_kegg[annotation][,1]), ]
# extract cluster
gene.clu.distri <- table(kodes$Cluster)
print(paste('Most of', interested.gene, 'related gene belong to:'))
print(gene.clu.distri[grepl(max(gene.clu.distri), gene.clu.distri)])
print(paste('The cluster distribution of', interested.gene, 'is:'))
print(gene.clu.distri)
# obtain cluster
temp <- as.data.frame(gene.clu.distri[grepl(max(gene.clu.distri), gene.clu.distri)])
Cluster <- rownames(temp)
# print(paste('Gene', interested.gene, 'belongs to', Cluster))
} else if (SearchPathway == TRUE){
annotation <- 'level3_pathway_name'
# grep gene info list
level3_path <- matched_kegg[grepl(interested.pathway, matched_kegg[annotation][,1]), ]
# extract cluster
path.clu.distri <- table(level3_path$Cluster)
print(paste('Most of ', interested.gene, 'related gene belong to:'))
print(path.clu.distri[grepl(max(path.clu.distri), path.clu.distri)])
print(paste('The cluster distribution of', interested.pathway, 'is:'))
print(path.clu.distri)
# obtain cluster
temp <- as.data.frame(gene.clu.distri[grepl(max(gene.clu.distri), gene.clu.distri)])
Cluster <- rownames(temp)
}
return(Cluster)
}
ObtainCorNet <- function(dataset, SubDataset, Cluster, group, group_sample.data,
gene_name=TRUE, thre.p = 0.05, thre.r = 0.8){
# obtain filtered pathway
Filteredcluster.path <- paste("./data/", dataset, "/output_data/R/", SubDataset, '_', Cluster, '_Enrichment.csv', sep = '')
Filteredcluster <- read.csv(Filteredcluster.path)
# obtain pathway list & matching & obtain gene list
rawdata <- obtain_pathdata(dataset, group, SubDataset, Cluster)
# obtain ko descrption as labels
kegg_anno.path <- paste("./data/", dataset, "/rawdata/kegg_annotation_all.csv", sep = '')
kegg_anno <- read.csv(kegg_anno.path)
path_list <- Filteredcluster$level3_pathway_name
# initial
path <- path_list[1]
# for loop
gene_list <- rawdata[grepl(path, rawdata$level3_pathway_name), ]
for (path in path_list[2:length(path_list)]){
list <- rawdata[grepl(path, rawdata$level3_pathway_name), ]
gene_list <- rbind(gene_list, list)
}
Col.name <- c('id',
group_sample.data$Samples,
# "SignalP", "tmhmm" ,
"SwissProt_ID", "SwissProt_Description",
"level1_pathway_id", "level1_pathway_name", "level2_pathway_id", "level2_pathway_name", "level3_pathway_id", "level3_pathway_name",
'ko', 'ko_name', 'ko_des', 'level3_pathway_name')
CorCol.name <- c('id', group_sample.data$Samples, 'ko_name')
Hub_group_raw <- unique(gene_list[Col.name])
Hub_group <- unique(Hub_group_raw[CorCol.name])
merge_data <- unique(Hub_group)
if (gene_name == TRUE) {
# v1: merge gene_id and gene description
gene_names <- data.frame(paste(merge_data$ko_name, ' (', merge_data$id, ')', sep = ''))
colnames(gene_names) <- 'label'
corr_data <- cbind(gene_names, merge_data[,2:(length(merge_data)-1)])
rownames(corr_data) <- corr_data$label
data <- t(corr_data[, -1])
} else {
# v2: only gene id
corr_data <- merge_data
rownames(corr_data) <- corr_data$id
data <- t(corr_data[, -c(1, length(corr_data))])
}
# calculate correlation matrix
library(psych)
occor = corr.test(data,use="pairwise",method="spearman",adjust="fdr",alpha=0.05)
occor.r = occor$r # 取相关性矩阵R值
occor.p = occor$p # 取相关性矩阵p值
occor.r[occor.p>thre.p | abs(occor.r) < thre.r ] = 0
occor.path <- paste("./data/", dataset, "/output_data/R/", SubDataset, '_', Cluster, '_p', thre.p , '_r', thre.r, '_CorNet.csv', sep = '')
if (file.exists(occor.path)) {
print(paste('File', occor.path, 'exists.'))
} else {
write.csv(occor.r, occor.path)
}
return(occor.r)
}
ObtainGraph <- function(dataset, SubDataset, Cluster, group, group_sample.data, igraph, gene_name=TRUE, thre.p = 0.05, thre.r = 0.8){
# === nodes ===
E(igraph)$corr <- E(igraph)$weight
E(igraph)$weight <- abs(E(igraph)$weight)
# nodes number check
print(paste('The total nodes number is:',length(V(igraph)$name))) #或 vcount(igraph)
#节点度(Degree)
V(igraph)$degree <- degree(igraph)
# V(igraph)$degree
# --- Visualization: Degree distribution ---
if (ExploreDegree == TRUE){
degree_dist <- degree.distribution(igraph)[-1]
degree_num <- 1:max(V(igraph)$degree)
par(mfrow = c(1, 2))
hist(V(igraph)$degree, xlab = 'Degree', ylab = 'Frequency',
main = 'Degree distribution')
plot(degree_num, degree_dist, log = 'xy',
xlab = 'Log-degree',
ylab = 'Log-intensity',
main = 'Log-log degree distribution')
}
# --- Exploratory Analysis: Relationship between node degrees and the average degree of its "neighbors" ---
if (ExploreNeibor == TRUE){
neighbor_degree <- graph.knn(igraph, V(igraph))$knn
plot(V(igraph)$degree, neighbor_degree, log = 'xy',
xlab = 'Log degree', ylab = 'Log average neighbor degree')
}
#加权度(Weighted degree)
V(igraph)$weight_degree <- strength(igraph)
# V(igraph)$weight_degree
#接近中心性(Closeness centrality)
V(igraph)$closeness_centrality <- closeness(igraph)
# V(igraph)$closeness_centrality
#介数中心性(Betweenness centrality)
V(igraph)$betweenness_centrality <- betweenness(igraph)
# V(igraph)$betweenness_centrality
#特征向量中心性(Eigenvector centrality)
V(igraph)$eigenvector_centrality <- evcent(igraph)$vector
# V(igraph)$eigenvector_centrality
# --- Exploratory Analysis: The relationship between three features describing node centrality ---
if (ExploreNeibor == TRUE){
library(car)
scatter3d(V(igraph)$closeness_centrality, V(igraph)$betweenness_centrality, V(igraph)$eigenvector_centrality, xlab = 'Closeness centrality', ylab = 'Betweenness centrality', zlab = 'Eigenvector centrality', surface = FALSE)
# Relationship between node degree and node centrality, e.g. the relationship with feature vector centrality
plot(V(igraph)$degree, V(igraph)$eigenvector_centrality,
xlab = 'Degree', ylab = 'Eigenvector centrality')
}
# obtainn node list
node_list <- data.frame(node_id = V(igraph)$name,
degree = V(igraph)$degree,
weight_degree = V(igraph)$weight_degree,
closeness_centrality = V(igraph)$closeness_centrality,
betweenness_centrality = V(igraph)$betweenness_centrality,
eigenvector_centrality = V(igraph)$eigenvector_centrality)
# save node
node.path <- paste("./data/", dataset, "/output_data/R/", SubDataset, '_', Cluster, '_p', thre.p , '_r', thre.r, '_Node.csv', sep = '')
write.table(node_list, node.path, sep = 't', row.names = FALSE, quote = FALSE)
# === Edges ===
#权重(Weighted)
E(igraph)$weight
#边介数中心性(Edge betweenness centrality)
E(igraph)$betweenness_centrality <- edge.betweenness(igraph)
E(igraph)$betweenness_centrality
# obtain edge list
edge <- data.frame(as_edgelist(igraph)) #igraph
edge_list <- data.frame(
source = edge[[1]],
target = edge[[2]],
weight = E(igraph)$weight,
correlation = E(igraph)$corr,
betweenness_centrality = E(igraph)$betweenness_centrality
)
# save edge
edge.path <- paste("./data/", dataset, "/output_data/R/", SubDataset, '_', Cluster, '_p', thre.p , '_r', thre.r, '_Edges.csv', sep = '')
write.table(edge_list, edge.path, sep = 't', row.names = FALSE, quote = FALSE)
graph_list <- list(node_list, edge_list)
return(graph_list)
}
obtain_landmark <- function(graph_list, dataset, group, SubDataset,
Cluster, group_sample.data){
# landmark genes: merge nodes list with expression level
node_list <- graph_list[[1]]
edge_list <- graph_list[[2]]
node_list$id <- node_list$node_id
# var_list <- basic_var(dataset, group, SubDataset)
# matched_kegg <- var_list[[length(var_list)]]
rawdata <- obtain_pathdata(dataset, group, SubDataset, Cluster)
CorCol.name <- c('id', group_sample.data$Samples, 'ko_name', 'ko_des')
landmark_info <- unique(rawdata[CorCol.name])
landmark <- merge(node_list, landmark_info, by = 'id')
landmark$Mean <- apply(landmark[group_sample.data$Samples], 1, mean)
landmark.ordered <- landmark[order(landmark$Mean, decreasing = TRUE),]
return(landmark.ordered)
}
# setting parameters
dataset = 'LY_9samples_metatranscriptomics'
group = 'Yellow_vs_Dark'
SubDataset <- 'YvsD_SubCell_dgi'
var_list <- basic_var(dataset, group, SubDataset)
## [1] "File ./data/LY_9samples_metatranscriptomics/inputdata/graphdata_Yellow_vs_Darklabeled.csv exists."
p <- Clustering(dataset, group, SubDataset)
## [1] "File ./data/LY_9samples_metatranscriptomics/inputdata/graphdata_Yellow_vs_Darklabeled.csv exists."
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.1 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
##
## Attaching package: 'cowplot'
##
##
## The following object is masked from 'package:lubridate':
##
## stamp
## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
p
# --- load functional genes ---
library(dplyr)
# setting parameters
dataset = 'LY_9samples_metatranscriptomics'
group = 'Yellow_vs_Dark'
SubDataset <- 'YvsD_SubCell_dgi'
var_list <- basic_var(dataset, group, SubDataset)
## [1] "File ./data/LY_9samples_metatranscriptomics/inputdata/graphdata_Yellow_vs_Darklabeled.csv exists."
GeneSet_bycluster <- list('Cluster 1', 'Cluster 2','Cluster 3', 'Cluster 4', 'Cluster 5', 'Cluster 6', 'Cluster 7')
GeneSet_byswiss <- list(c('nitrate', 'nitrite'))
GeneSet_bykegg <- list(c('Phototransduction'))
# Annotation(dataset, group, SubDataset, GeneSet_bykegg,
# GeneSet_byswiss=0,
# GeneSet_bycluster=0)
Annotation(dataset, group, SubDataset, GeneSet_bykegg=0,
GeneSet_byswiss=0,
GeneSet_bycluster)
## [1] "File ./data/LY_9samples_metatranscriptomics/inputdata/graphdata_Yellow_vs_Darklabeled.csv exists."
## [1] "The total gene number of Cluster 1 is: 512"
## [1] "The total gene number of Cluster 2 is: 353"
## [1] "The total gene number of Cluster 3 is: 26"
## [1] "The total gene number of Cluster 4 is: 120"
## [1] "The total gene number of Cluster 5 is: 783"
## [1] "The total gene number of Cluster 6 is: 80"
## [1] "The total gene number of Cluster 7 is: 131"
# rm(list=ls())
# --- setting parameters ---
# *dataset
dataset = 'LY_9samples_metatranscriptomics'
group = 'Yellow_vs_Dark'
SubDataset <- 'YvsD_SubCell_dgi'
Cluster = 'Cluster 3'
# *set basic data for p-value and other statistic parameters calculations
ref_group <- 'Dark'
group_sample <- 'group_Yellow'
group_sample.path <- paste('./data/', dataset, '/inputdata/', group_sample, '.csv', sep = '')
group_sample.data <- read.csv(group_sample.path)
annotation.level <- 'level3_pathway_name'
# === pvalue and statistic analysis ===
rawdata <- obtain_pathdata(dataset, group, SubDataset,
Cluster)
## [1] "The samples and groups are as follows:"
## [1] "Blue1.x" "Blue2.x" "Blue3.x" "Dark1" "Dark2" "Dark3" "Yellow1"
## [8] "Yellow2" "Yellow3"
## Samples Group
## 1 Dark1 Dark
## 2 Dark2 Dark
## 3 Dark3 Dark
## 4 Blue1 Blue
## 5 Blue2 Blue
## 6 Blue3 Blue
## 7 Yellow1 Yellow
## 8 Yellow2 Yellow
## 9 Yellow3 Yellow
# *setting for visualization app
fig.name <- paste(Cluster, ' of ', group_sample, sep = '')
# --- obtain expression level by pathway ---
library(dplyr)
pathway <- rawdata %>%
group_by(level3_pathway_name) %>%
summarise(Dark1mean = mean(Dark1),
Dark2mean = mean(Dark2),
Dark3mean = mean(Dark3),
Yellow1mean = mean(Yellow1),
Yellow2mean = mean(Yellow2),
Yellow3mean = mean(Yellow3))
# --- obtain p-value ---
df.pathlevel_p <- obtain_pvalue(dataset,ref_group, group_sample.data, pathway, annotation.level)
##
## Attaching package: 'ggpubr'
## The following object is masked from 'package:cowplot':
##
## get_legend
# --- obtain other statistic parameters ---
df.pathlevel_p <- ObtainStatistic(pathway, df.pathlevel_p, group_sample.data)
# *set threshod for filtering
p_thre <- 0.2
FC_up <- 2
FC_down <- 0.5
Expre <- 10
# --- conduct enrichment analysis ---
fig <- enrich(df.pathlevel_p, SubDataset, dataset, Cluster,
p_thre, FC_up, FC_down, Expre,
group_sample.data)
## Warning: Using one column matrices in `filter()` was deprecated in dplyr 1.1.0.
## ℹ Please use one dimensional logical vectors instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## [1] "File ./data/LY_9samples_metatranscriptomics/output_data/R/YvsD_SubCell_dgi_Cluster 3_Enrichment.csv Exist."
## Warning: ggrepel: 23 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
fig
## Warning: ggrepel: 29 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
To conduct gene panels comparison, enrichment analysis of targeted panel is a prerequisite. Please make sure to have used APP2 for enrichment analysis with proper threshold.
In default, all the clusters are compared.
In this case, it can be observed clearly that cluster 3 is the hub gene panel with high expression level.
# setting parameters
dataset = 'LY_9samples_metatranscriptomics'
group = 'Yellow_vs_Dark'
SubDataset <- 'YvsD_SubCell_dgi'
p <- GPsEnrichCompare(dataset, group, SubDataset, k=7)
## Warning: ggrepel: 130 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
p
## Warning: ggrepel: 136 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
From app3, we can identify the Hub Gene Panels (HGPs) and Signaling Gene Panels (HGPs).
To better regulate the microbiome, we construct regulatory network, also called interaction network, on the genes level based on microbial genetic topology.
The occor.r and graph_list can also be utilized for visualization in Gephi or R.
# setting parameters
dataset <- 'LY_9samples_metatranscriptomics'
group <- 'Yellow_vs_Dark'
SubDataset <- 'YvsD_SubCell_dgi'
# *set basic data for statistic parameters calculations
ref_group <- 'Dark'
group_sample <- 'group_Yellow'
group_sample.path <- paste('./data/', dataset, '/inputdata/', group_sample, '.csv', sep = '')
group_sample.data <- read.csv(group_sample.path)
# --- gene panel setting ---
# *setting for gene searching: Obtain the targeted genes and corresponding gene panel
# method 1: manually set interested gene panel (cluster)
Cluster <- 'Cluster 3' # HGP
# method 2: search the gene panel that interested gene or pathway assigned to
interested.gene <- 'nitrite'
interested.pathway <- 'Phototransduction'
SearchGene <- TRUE
SearchPathway <- FALSE
'
option: level3_pathway_name, ko_des, SwissProt_Description
'
## [1] "\noption: level3_pathway_name, ko_des, SwissProt_Description\n"
Cluster <- Search_GePa(var_list, SearchGene=TRUE, SearchPathway=FALSE)
## [1] "Most of nitrite related gene belong to:"
## Cluster 4
## 5
## [1] "The cluster distribution of nitrite is:"
##
## Cluster 2 Cluster 3 Cluster 4 Cluster 5 Cluster 6
## 3 2 5 1 1
# === obtain correlation matrix: occor.r ===
occor.r <- ObtainCorNet(dataset, SubDataset, Cluster, group, group_sample.data, gene_name=TRUE, thre.p = 0.05, thre.r = 0.8)
## [1] "The samples and groups are as follows:"
## [1] "Blue1.x" "Blue2.x" "Blue3.x" "Dark1" "Dark2" "Dark3" "Yellow1"
## [8] "Yellow2" "Yellow3"
## Samples Group
## 1 Dark1 Dark
## 2 Dark2 Dark
## 3 Dark3 Dark
## 4 Blue1 Blue
## 5 Blue2 Blue
## 6 Blue3 Blue
## 7 Yellow1 Yellow
## 8 Yellow2 Yellow
## 9 Yellow3 Yellow
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
## [1] "File ./data/LY_9samples_metatranscriptomics/output_data/R/YvsD_SubCell_dgi_Cluster 4_p0.05_r0.8_CorNet.csv exists."
library(igraph)
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:lubridate':
##
## %--%, union
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:purrr':
##
## compose, simplify
## The following object is masked from 'package:tidyr':
##
## crossing
## The following object is masked from 'package:tibble':
##
## as_data_frame
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
# parameter setting*
ExploreDegree = FALSE
ExploreNeibor = FALSE
adjacency_weight <- occor.r
igraph <- graph_from_adjacency_matrix(as.matrix(adjacency_weight), mode = 'undirected', weighted = TRUE, diag = FALSE)
# === obtain graph: graph_list ===
graph_list <- ObtainGraph(dataset, SubDataset, Cluster, group, group_sample.data, igraph, gene_name=TRUE, thre.p = 0.05, thre.r = 0.8)
## [1] "The total nodes number is: 22"
# --- obtain landmark genes ----
landmark.ordered <- obtain_landmark(graph_list, dataset, group, SubDataset,
Cluster, group_sample.data)
## [1] "The samples and groups are as follows:"
## [1] "Blue1.x" "Blue2.x" "Blue3.x" "Dark1" "Dark2" "Dark3" "Yellow1"
## [8] "Yellow2" "Yellow3"
## Samples Group
## 1 Dark1 Dark
## 2 Dark2 Dark
## 3 Dark3 Dark
## 4 Blue1 Blue
## 5 Blue2 Blue
## 6 Blue3 Blue
## 7 Yellow1 Yellow
## 8 Yellow2 Yellow
## 9 Yellow3 Yellow
landmark.top5 <- landmark.ordered$ko_name[1:5]
print(paste('The top 5 landmark genes of', Cluster, 'is:'))
## [1] "The top 5 landmark genes of Cluster 4 is:"
print(landmark.top5)
## [1] NA NA NA NA NA
# 设置不同type的边颜色不同
pale8 <- c("#8DD3C7FF","#FFFFB3FF","#BEBADAFF","#FB8072FF","#80B1D3FF","#FDB462FF","#B3DE69FF","#FCCDE5FF")
E(igraph)$color <- "#FB8072FF"
index <- E(igraph)$type == "mention"
E(igraph)$color <- '#8DD3C7FF'
plot(igraph,
vertex.color = '#BEBADAFF', edge.color = E(igraph)$color,
layout = layout_in_circle)
# cfg <- cluster_fast_greedy(igraph)
# plot(cfg, igraph)
# setting parameters
dataset = 'LY_9samples_metatranscriptomics'
group = 'Blue_vs_Dark'
SubDataset <- 'BvsD_SubCell_dgi'
var_list <- basic_var(dataset, group, SubDataset)
## [1] "File ./data/LY_9samples_metatranscriptomics/inputdata/graphdata_Blue_vs_Darklabeled.csv exists."
p <- Clustering(dataset, group, SubDataset)
## [1] "File ./data/LY_9samples_metatranscriptomics/inputdata/graphdata_Blue_vs_Darklabeled.csv exists."
p
# --- load functional genes ---
library(dplyr)
# setting parameters
dataset = 'LY_9samples_metatranscriptomics'
group = 'Blue_vs_Dark'
SubDataset <- 'BvsD_SubCell_dgi'
var_list <- basic_var(dataset, group, SubDataset)
## [1] "File ./data/LY_9samples_metatranscriptomics/inputdata/graphdata_Blue_vs_Darklabeled.csv exists."
GeneSet_bycluster <- list('Cluster 1', 'Cluster 2','Cluster 3', 'Cluster 4', 'Cluster 5', 'Cluster 6', 'Cluster 7')
GeneSet_byswiss <- list(c('nitrate', 'nitrite'))
GeneSet_bykegg <- list(c('Phototransduction'))
# Annotation(dataset, group, SubDataset, GeneSet_bykegg,
# GeneSet_byswiss=0,
# GeneSet_bycluster=0)
Annotation(dataset, group, SubDataset, GeneSet_bykegg=0,
GeneSet_byswiss=0,
GeneSet_bycluster)
## [1] "File ./data/LY_9samples_metatranscriptomics/inputdata/graphdata_Blue_vs_Darklabeled.csv exists."
## [1] "The total gene number of Cluster 1 is: 1838"
## [1] "The total gene number of Cluster 2 is: 767"
## [1] "The total gene number of Cluster 3 is: 4180"
## [1] "The total gene number of Cluster 4 is: 394"
## [1] "The total gene number of Cluster 5 is: 2105"
## [1] "The total gene number of Cluster 6 is: 899"
## [1] "The total gene number of Cluster 7 is: 744"
# The output are all unique genes
Cluster 1 Cluster 2 Cluster 3 Cluster 4 Cluster 5 Cluster 6 Cluster 7 4828 1795 9533 1273 4198 2061 1589
# rm(list=ls())
# --- setting parameters ---
# *dataset
dataset = 'LY_9samples_metatranscriptomics'
group = 'Blue_vs_Dark'
SubDataset <- 'BvsD_SubCell_dgi'
Cluster = 'Cluster 7'
# *set basic data for p-value and other statistic parameters calculations
ref_group <- 'Dark'
group_sample <- 'group_Blue'
group_sample.path <- paste('./data/', dataset, '/inputdata/', group_sample, '.csv', sep = '')
group_sample.data <- read.csv(group_sample.path)
annotation.level <- 'level3_pathway_name'
# === pvalue and statistic analysis ===
rawdata <- obtain_pathdata(dataset, group, SubDataset,
Cluster)
## [1] "The samples and groups are as follows:"
## [1] "Blue1" "Blue2" "Blue3" "Dark1" "Dark2" "Dark3"
## [7] "Yellow1.x" "Yellow2.x" "Yellow3.x"
## Samples Group
## 1 Dark1 Dark
## 2 Dark2 Dark
## 3 Dark3 Dark
## 4 Blue1 Blue
## 5 Blue2 Blue
## 6 Blue3 Blue
## 7 Yellow1 Yellow
## 8 Yellow2 Yellow
## 9 Yellow3 Yellow
# *setting for visualization app
fig.name <- paste(Cluster, ' of ', group_sample, sep = '')
# --- obtain expression level by pathway ---
library(dplyr)
pathway <- rawdata %>%
group_by(level3_pathway_name) %>%
summarise(Dark1mean = mean(Dark1),
Dark2mean = mean(Dark2),
Dark3mean = mean(Dark3),
Blue1mean = mean(Blue1),
Blue2mean = mean(Blue2),
Blue3mean = mean(Blue3))
# --- obtain p-value ---
df.pathlevel_p <- obtain_pvalue(dataset,ref_group, group_sample.data, pathway, annotation.level)
# --- obtain other statistic parameters ---
df.pathlevel_p <- ObtainStatistic(pathway, df.pathlevel_p, group_sample.data)
# *set threshod for filtering
if (Cluster == 'Cluster 4'){
p_thre <- 0.05
FC_up <- 2
FC_down <- 0.2
Expre <- 20
} else if (Cluster == 'Cluster 5'){
p_thre <- 0.001
FC_up <- 10
FC_down <- 0.5
Expre <- 10
} else if (Cluster == 'Cluster 7'){
p_thre <- 0.01
FC_up <- 2
FC_down <- 0.5
Expre <- 10
} else if (Cluster == 'Cluster 2'){
p_thre <- 0.001
FC_up <- 2
FC_down <- 0.5
Expre <- 10
} else if (Cluster == 'Cluster 3'){
p_thre <- 0.01
FC_up <- 2
FC_down <- 0.5
Expre <- 5.5
} else if (Cluster == 'Cluster 1'){
p_thre <- 0.01
FC_up <- 2
FC_down <- 0.5
Expre <- 1.5
} else if (Cluster == 'Cluster 6'){
p_thre <- 0.01
FC_up <- 2
FC_down <- 0.5
Expre <- 10
}
# --- conduct enrichment analysis ---
fig <- enrich(df.pathlevel_p, SubDataset, dataset, Cluster,
p_thre, FC_up, FC_down, Expre,
group_sample.data)
## [1] "File ./data/LY_9samples_metatranscriptomics/output_data/R/BvsD_SubCell_dgi_Cluster 7_Enrichment.csv Exist."
fig
To conduct gene panels comparison, enrichment analysis of targeted panel is a prerequisite. Please make sure to have used APP2 for enrichment analysis with proper threshold.
In default, all the clusters are compared.
In this case, it can be observed clearly that cluster 3 is the hub gene panel with high expression level.
# setting parameters
dataset = 'LY_9samples_metatranscriptomics'
group = 'Blue_vs_Dark'
SubDataset <- 'BvsD_SubCell_dgi'
p <- GPsEnrichCompare(dataset, group, SubDataset, k=7)
## Warning: ggrepel: 100 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
p
## Warning: ggrepel: 101 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
# setting parameters
dataset = 'LY_9samples_metatranscriptomics'
group = 'Blue_vs_Dark'
SubDataset <- 'BvsD_SubCell_dgi'
# *set basic data for statistic parameters calculations
ref_group <- 'Dark'
group_sample <- 'group_Blue'
group_sample.path <- paste('./data/', dataset, '/inputdata/', group_sample, '.csv', sep = '')
group_sample.data <- read.csv(group_sample.path)
# --- gene panel setting ---
# *setting for gene searching: Obtain the targeted genes and corresponding gene panel
# method 1: manually set interested gene panel (cluster)
Cluster <- 'Cluster 3' # HGP
# method 2: search the gene panel that interested gene or pathway assigned to
interested.gene <- 'nitrite'
interested.pathway <- 'Phototransduction'
SearchGene <- TRUE
SearchPathway <- FALSE
'
option: level3_pathway_name, ko_des, SwissProt_Description
'
## [1] "\noption: level3_pathway_name, ko_des, SwissProt_Description\n"
Cluster <- Search_GePa(var_list, SearchGene=TRUE, SearchPathway=FALSE)
## [1] "Most of nitrite related gene belong to:"
## Cluster 3
## 31
## [1] "The cluster distribution of nitrite is:"
##
## Cluster 1 Cluster 2 Cluster 3 Cluster 4 Cluster 5 Cluster 6 Cluster 7
## 16 12 31 9 6 5 9
# === obtain correlation matrix: occor.r ===
occor.r <- ObtainCorNet(dataset, SubDataset, Cluster, group, group_sample.data, gene_name=FALSE, thre.p = 0.05, thre.r = 0.8)
## [1] "The samples and groups are as follows:"
## [1] "Blue1" "Blue2" "Blue3" "Dark1" "Dark2" "Dark3"
## [7] "Yellow1.x" "Yellow2.x" "Yellow3.x"
## Samples Group
## 1 Dark1 Dark
## 2 Dark2 Dark
## 3 Dark3 Dark
## 4 Blue1 Blue
## 5 Blue2 Blue
## 6 Blue3 Blue
## 7 Yellow1 Yellow
## 8 Yellow2 Yellow
## 9 Yellow3 Yellow
## [1] "File ./data/LY_9samples_metatranscriptomics/output_data/R/BvsD_SubCell_dgi_Cluster 3_p0.05_r0.8_CorNet.csv exists."
library(igraph)
# parameter setting*
ExploreDegree = FALSE
ExploreNeibor = FALSE
adjacency_weight <- occor.r
igraph <- graph_from_adjacency_matrix(as.matrix(adjacency_weight), mode = 'undirected', weighted = TRUE, diag = FALSE)
# === obtain graph: graph_list ===
graph_list <- ObtainGraph(dataset, SubDataset, Cluster, group, group_sample.data, igraph, gene_name=FALSE, thre.p = 0.05, thre.r = 0.8)
## [1] "The total nodes number is: 470"
# --- obtain landmark genes ----
landmark.ordered <- obtain_landmark(graph_list, dataset, group, SubDataset,
Cluster, group_sample.data)
## [1] "The samples and groups are as follows:"
## [1] "Blue1" "Blue2" "Blue3" "Dark1" "Dark2" "Dark3"
## [7] "Yellow1.x" "Yellow2.x" "Yellow3.x"
## Samples Group
## 1 Dark1 Dark
## 2 Dark2 Dark
## 3 Dark3 Dark
## 4 Blue1 Blue
## 5 Blue2 Blue
## 6 Blue3 Blue
## 7 Yellow1 Yellow
## 8 Yellow2 Yellow
## 9 Yellow3 Yellow
landmark.top5 <- landmark.ordered$ko_name[1:5]
print(paste('The top 5 landmark genes of', Cluster, 'is:'))
## [1] "The top 5 landmark genes of Cluster 3 is:"
print(landmark.top5)
## [1] "fusA, GFM, EFG" "folD" "E3.1.11.2, xthA" "fusA, GFM, EFG"
## [5] "trpB"
# 设置不同type的边颜色不同
pale8 <- c("#8DD3C7FF","#FFFFB3FF","#BEBADAFF","#FB8072FF","#80B1D3FF","#FDB462FF","#B3DE69FF","#FCCDE5FF")
E(igraph)$color <- "#FB8072FF"
index <- E(igraph)$type == "mention"
E(igraph)$color <- '#8DD3C7FF'
plot(igraph,
vertex.color = '#BEBADAFF', edge.color = E(igraph)$color,
layout = layout_in_circle)
cfg <- cluster_fast_greedy(igraph)
plot(cfg, igraph)
The gene panels are obtained after 20000 epochs
feature shape: (5, 57)
cuda is not available
Threshold: 4
Links number: 541
Average Links: 9.491228070175438
Adj: (57, 57) Edges: 541
X: (57, 5)
Result; SCI score (Clustering quality) is: 0.59939337
Tips: increase threshold will bring about more edges, that more information can be passed to the neighboring nodes, while the calculation time will also be longer and probably lead to over-fitting.
# rm(list=ls())
# --- load functional genes ---
library(dplyr)
# setting parameters
dataset = 'LY_15samples_metagenomics'
group = 'Yellow_vs_Dark'
SubDataset <- 'Yellow_vs_Dark'
var_list <- basic_var(dataset, group, SubDataset)
## [1] "File ./data/LY_15samples_metagenomics/inputdata/graphdata_Yellow_vs_Darklabeled.csv exists."
p <- Clustering(dataset, group, SubDataset)
## [1] "File ./data/LY_15samples_metagenomics/inputdata/graphdata_Yellow_vs_Darklabeled.csv exists."
p
[1] "The total gene number of Cluster 1 is: 7"
[1] "The total gene number of Cluster 2 is: 9"
[1] "The total gene number of Cluster 3 is: 3"
[1] "The total gene number of Cluster 4 is: 7"
[1] "The total gene number of Cluster 5 is: 15"
[1] "The total gene number of Cluster 6 is: 8"
[1] "The total gene number of Cluster 7 is: 8"
# rm(list=ls())
dataset = 'LY_15samples_metagenomics'
group = 'Yellow_vs_Dark'
SubDataset <- 'Yellow_vs_Dark'
var_list <- basic_var(dataset, group, SubDataset)
## [1] "File ./data/LY_15samples_metagenomics/inputdata/graphdata_Yellow_vs_Darklabeled.csv exists."
GeneSet_bycluster <- list('Cluster 1', 'Cluster 2', 'Cluster 3', 'Cluster 4', 'Cluster 5', 'Cluster 6', 'Cluster 7')
# GeneSet_byswiss <- list(c('nitrate', 'nitrite'))
# GeneSet_bykegg <- list(c('Phototransduction'))
# Annotation(dataset, group, SubDataset, GeneSet_bykegg,
# GeneSet_byswiss=0,
# GeneSet_bycluster=0)
Annotation(dataset, group, SubDataset, GeneSet_bykegg=0,
GeneSet_byswiss=0,
GeneSet_bycluster)
## [1] "File ./data/LY_15samples_metagenomics/inputdata/graphdata_Yellow_vs_Darklabeled.csv exists."
## [1] "The total gene number of Cluster 1 is: 7"
## [1] "The total gene number of Cluster 2 is: 9"
## [1] "The total gene number of Cluster 3 is: 3"
## [1] "The total gene number of Cluster 4 is: 7"
## [1] "The total gene number of Cluster 5 is: 15"
## [1] "The total gene number of Cluster 6 is: 8"
## [1] "The total gene number of Cluster 7 is: 8"
# rm(list=ls())
# --- setting parameters ---
# *dataset
dataset = 'LY_15samples_metagenomics'
group = 'Yellow_vs_Dark'
SubDataset <- 'Yellow_vs_Dark'
Cluster = 'Cluster 5'
# *set basic data for p-value and other statistic parameters calculations
ref_group <- 'Dark'
group_sample <- 'group_Yellow'
group_sample.path <- paste('./data/', dataset, '/inputdata/', group_sample, '.csv', sep = '')
group_sample.data <- read.csv(group_sample.path)
annotation.level <- 'level3_pathway_name'
# === pvalue and statistic analysis ===
rawdata <- obtain_pathdata(dataset, group, SubDataset,
Cluster)
## [1] "The samples and groups are as follows:"
## [1] "G3A" "G2A" "R3A" "D2A" "Y1A" "Y2A" "B2A" "R1A" "D3A" "B1A" "Y3A" "B3A"
## [13] "D1A" "G1A" "R2A"
## Samples Group
## 1 D2A Dark
## 2 D3A Dark
## 3 D1A Dark
## 4 B2A Blue
## 5 B1A Blue
## 6 B3A Blue
## 7 G3A Green
## 8 G2A Green
## 9 G1A Green
## 10 R3A Red
## 11 R1A Red
## 12 R2A Red
## 13 Y1A Yellow
## 14 Y2A Yellow
## 15 Y3A Yellow
# *setting for visualization app
fig.name <- paste('HGP of ', group_sample, sep = '')
# --- obtain expression level by pathway ---
library(dplyr)
pathway <- rawdata %>%
group_by(level3_pathway_name) %>%
summarise(Dark1mean = mean(D1A),
Dark2mean = mean(D2A),
Dark3mean = mean(D3A),
Yellow1mean = mean(Y1A),
Yellow2mean = mean(Y2A),
Yellow3mean = mean(Y3A))
# --- obtain p-value ---
df.pathlevel_p <- obtain_pvalue(dataset,ref_group, group_sample.data, pathway, annotation.level)
# --- obtain other statistic parameters ---
df.pathlevel_p <- ObtainStatistic(pathway, df.pathlevel_p, group_sample.data)
# *set threshod for filtering
p_thre <- 0.275
FC_up <- 1
FC_down <- 1
Expre <- 0
# --- conduct enrichment analysis ---
fig <- enrich(df.pathlevel_p, SubDataset, dataset, Cluster,
p_thre, FC_up, FC_down, Expre,
group_sample.data)
## [1] "File ./data/LY_15samples_metagenomics/output_data/R/Yellow_vs_Dark_Cluster 5_Enrichment.csv Exist."
fig
feature shape: (5, 88)
cuda is not available
Threshold: 2
Links number: 1040
Average Links: 11.818181818181818
Adj: (88, 88) Edges: 1040
X: (88, 5)
# --- load functional genes ---
library(dplyr)
# setting parameters
dataset = 'LY_15samples_metagenomics'
group = 'Blue_vs_Dark'
SubDataset <- 'Blue_vs_Dark'
var_list <- basic_var(dataset, group, SubDataset)
## [1] "File ./data/LY_15samples_metagenomics/inputdata/graphdata_Blue_vs_Darklabeled.csv exists."
p <- Clustering(dataset, group, SubDataset)
## [1] "File ./data/LY_15samples_metagenomics/inputdata/graphdata_Blue_vs_Darklabeled.csv exists."
p
[1] "The total gene number of Cluster 1 is: 19"
[1] "The total gene number of Cluster 2 is: 10"
[1] "The total gene number of Cluster 3 is: 7"
[1] "The total gene number of Cluster 4 is: 20"
[1] "The total gene number of Cluster 5 is: 8"
[1] "The total gene number of Cluster 6 is: 11"
[1] "The total gene number of Cluster 7 is: 13"
# setting parameters
dataset = 'LY_15samples_metagenomics'
group = 'Blue_vs_Dark'
SubDataset <- 'Blue_vs_Dark'
var_list <- basic_var(dataset, group, SubDataset)
## [1] "File ./data/LY_15samples_metagenomics/inputdata/graphdata_Blue_vs_Darklabeled.csv exists."
GeneSet_bycluster <- list('Cluster 1', 'Cluster 2', 'Cluster 3', 'Cluster 4', 'Cluster 5', 'Cluster 6', 'Cluster 7')
# GeneSet_byswiss <- list(c('nitrate', 'nitrite'))
# GeneSet_bykegg <- list(c('Phototransduction'))
# Annotation(dataset, group, SubDataset, GeneSet_bykegg,
# GeneSet_byswiss=0,
# GeneSet_bycluster=0)
Annotation(dataset, group, SubDataset, GeneSet_bykegg=0,
GeneSet_byswiss=0,
GeneSet_bycluster)
## [1] "File ./data/LY_15samples_metagenomics/inputdata/graphdata_Blue_vs_Darklabeled.csv exists."
## [1] "The total gene number of Cluster 1 is: 19"
## [1] "The total gene number of Cluster 2 is: 10"
## [1] "The total gene number of Cluster 3 is: 7"
## [1] "The total gene number of Cluster 4 is: 20"
## [1] "The total gene number of Cluster 5 is: 8"
## [1] "The total gene number of Cluster 6 is: 11"
## [1] "The total gene number of Cluster 7 is: 13"
# rm(list=ls())
# --- setting parameters ---
# *dataset
dataset = 'LY_15samples_metagenomics'
group = 'Blue_vs_Dark'
SubDataset <- 'Blue_vs_Dark'
Cluster = 'Cluster 4' # 410 genes
# *set basic data for p-value and other statistic parameters calculations
ref_group <- 'Dark'
group_sample <- 'group_Blue'
group_sample.path <- paste('./data/', dataset, '/inputdata/', group_sample, '.csv', sep = '')
group_sample.data <- read.csv(group_sample.path)
annotation.level <- 'level3_pathway_name'
# === pvalue and statistic analysis ===
rawdata <- obtain_pathdata(dataset, group, SubDataset,
Cluster)
## [1] "The samples and groups are as follows:"
## [1] "G3A" "G2A" "R3A" "D2A" "Y1A" "Y2A" "B2A" "R1A" "D3A" "B1A" "Y3A" "B3A"
## [13] "D1A" "G1A" "R2A"
## Samples Group
## 1 D2A Dark
## 2 D3A Dark
## 3 D1A Dark
## 4 B2A Blue
## 5 B1A Blue
## 6 B3A Blue
## 7 G3A Green
## 8 G2A Green
## 9 G1A Green
## 10 R3A Red
## 11 R1A Red
## 12 R2A Red
## 13 Y1A Yellow
## 14 Y2A Yellow
## 15 Y3A Yellow
# *setting for visualization app
fig.name <- paste('HGP of ', group_sample, sep = '')
# --- obtain expression level by pathway ---
library(dplyr)
pathway <- rawdata %>%
group_by(level3_pathway_name) %>%
summarise(Dark1mean = mean(D1A),
Dark2mean = mean(D2A),
Dark3mean = mean(D3A),
Blue1mean = mean(B1A),
Blue2mean = mean(B2A),
Blue3mean = mean(B3A))
# --- obtain p-value ---
df.pathlevel_p <- obtain_pvalue(dataset,ref_group, group_sample.data, pathway, annotation.level)
# --- obtain other statistic parameters ---
df.pathlevel_p <- ObtainStatistic(pathway, df.pathlevel_p, group_sample.data)
# *set threshod for filtering
p_thre <- 0.275
FC_up <- 1
FC_down <- 1
Expre <- 0
# --- conduct enrichment analysis ---
fig <- enrich(df.pathlevel_p, SubDataset, dataset, Cluster,
p_thre, FC_up, FC_down, Expre,
group_sample.data)
## [1] "File ./data/LY_15samples_metagenomics/output_data/R/Blue_vs_Dark_Cluster 4_Enrichment.csv Exist."
fig
dataset_name = ‘ZJ_12samplesEES_metagenomics’
group = ‘NHQ_vs_Control’
feature shape: (5, 964)
cuda is not available
Threshold: 1
Links number: 12492
Average Links: 12.95850622406639
Adj: (964, 964) Edges: 12492
X: (964, 5)
# rm(list=ls())
# setting parameters
dataset = 'ZJ_12samplesEES_metagenomics'
group = 'NHQ_vs_Control'
SubDataset <- 'NHQ_vs_Control'
var_list <- basic_var(dataset, group, SubDataset)
## [1] "File ./data/ZJ_12samplesEES_metagenomics/inputdata/graphdata_NHQ_vs_Controllabeled.csv exists."
p <- Clustering(dataset, group, SubDataset)
## [1] "File ./data/ZJ_12samplesEES_metagenomics/inputdata/graphdata_NHQ_vs_Controllabeled.csv exists."
p
Output
[1] "The total gene number of Cluster 1 is: 213"
[1] "The total gene number of Cluster 2 is: 64"
[1] "The total gene number of Cluster 3 is: 58"
[1] "The total gene number of Cluster 4 is: 410"
[1] "The total gene number of Cluster 5 is: 8"
[1] "The total gene number of Cluster 6 is: 3"
[1] "The total gene number of Cluster 7 is: 15"# setting parameters
dataset = 'ZJ_12samplesEES_metagenomics'
group = 'NHQ_vs_Control'
SubDataset <- 'NHQ_vs_Control'
var_list <- basic_var(dataset, group, SubDataset)
## [1] "File ./data/ZJ_12samplesEES_metagenomics/inputdata/graphdata_NHQ_vs_Controllabeled.csv exists."
GeneSet_bycluster <- list('Cluster 1', 'Cluster 2', 'Cluster 3', 'Cluster 4', 'Cluster 5', 'Cluster 6', 'Cluster 7')
# GeneSet_byswiss <- list(c('nitrate', 'nitrite'))
# GeneSet_bykegg <- list(c('Phototransduction'))
# Annotation(dataset, group, SubDataset, GeneSet_bykegg,
# GeneSet_byswiss=0,
# GeneSet_bycluster=0)
Annotation(dataset, group, SubDataset, GeneSet_bykegg=0,
GeneSet_byswiss=0,
GeneSet_bycluster)
## [1] "File ./data/ZJ_12samplesEES_metagenomics/inputdata/graphdata_NHQ_vs_Controllabeled.csv exists."
## [1] "The total gene number of Cluster 1 is: 213"
## [1] "The total gene number of Cluster 2 is: 64"
## [1] "The total gene number of Cluster 3 is: 58"
## [1] "The total gene number of Cluster 4 is: 410"
## [1] "The total gene number of Cluster 5 is: 8"
## [1] "The total gene number of Cluster 6 is: 3"
## [1] "The total gene number of Cluster 7 is: 15"
Obtain Pathway
group_by, summarise: name need to be set based on the samples’ name
# rm(list=ls())
# --- setting parameters ---
# *dataset
dataset <- 'ZJ_12samplesEES_metagenomics'
group = 'NHQ_vs_Control'
SubDataset <- 'NHQ_vs_Control'
Cluster = 'Cluster 4' # 410 genes
# *set basic data for p-value and other statistic parameters calculations
ref_group <- 'Control'
group_sample <- 'group_NHQ'
group_sample.path <- paste('./data/', dataset, '/inputdata/', group_sample, '.csv', sep = '')
group_sample.data <- read.csv(group_sample.path)
annotation.level <- 'level3_pathway_name'
# === pvalue and statistic analysis ===
rawdata <- obtain_pathdata(dataset, group, SubDataset,
Cluster)
## [1] "The samples and groups are as follows:"
## [1] "z9_1" "z10_3" "z10_1" "z9_2" "z11_2" "z11_3" "z10_2" "z11_1" "z1_3"
## [10] "z9_3" "z1_1" "z1_2"
## Samples Group
## 1 z1_3 Control
## 2 z1_1 Control
## 3 z1_2 Control
## 4 z11_2 MV
## 5 z11_3 MV
## 6 z11_1 MV
## 7 z10_3 NHQ
## 8 z10_1 NHQ
## 9 z10_2 NHQ
## 10 z9_1 NR
## 11 z9_2 NR
## 12 z9_3 NR
# *setting for visualization app
fig.name <- paste('HGP of ', group_sample, sep = '')
# --- obtain expression level by pathway ---
library(dplyr)
pathway <- rawdata %>%
group_by(level3_pathway_name) %>%
summarise(Control1mean = mean(z1_1),
Control2mean = mean(z1_2),
Control3mean = mean(z1_3),
NHQ1mean = mean(z10_1),
NHQ2mean = mean(z10_2),
NHQ3mean = mean(z10_3))
# --- obtain p-value ---
df.pathlevel_p <- obtain_pvalue(dataset,ref_group, group_sample.data, pathway, annotation.level)
# --- obtain other statistic parameters ---
df.pathlevel_p <- ObtainStatistic(pathway, df.pathlevel_p, group_sample.data)
# *set threshod for filtering
p_thre <- 0.275
FC_up <- 1
FC_down <- 1
Expre <- 1
# --- conduct enrichment analysis ---
fig <- enrich(df.pathlevel_p, SubDataset, dataset, Cluster,
p_thre, FC_up, FC_down, Expre,
group_sample.data)
## [1] "File ./data/ZJ_12samplesEES_metagenomics/output_data/R/NHQ_vs_Control_Cluster 4_Enrichment.csv Exist."
fig