Define function & basic setting

rm(list=ls()) 

VisualizationSet

library(Rtsne)
library(ggplot2)
library(RColorBrewer)
library(paletteer)

# theme
top.mar=0.1
right.mar=0.1
bottom.mar=0.1
left.mar=0.1
mytheme <- theme_classic() +
  theme(plot.title = element_text(hjust = 0.5,size = 18, face = 'bold'),
        axis.text = element_text(size = 14),
        axis.title = element_text(size = 16, face = 'bold'),
        axis.text.x = element_text(size = 14, vjust = 0.5), # vjust = -0.001
        legend.text = element_text(size = 12), 
        legend.title = element_text(size = 16),
        legend.position = 'none', 
        plot.margin=unit(x=c(top.mar,right.mar,bottom.mar,left.mar),
                         units="inches"))

mytheme1 <- theme_classic() +
  theme(plot.title = element_text(hjust = 0.5,size = 18, face = 'bold'),
        axis.text = element_text(size = 10),
        axis.title = element_text(size = 14, face = 'bold'),
        axis.text.x = element_text(size = 10, vjust = 0.5), # vjust = -0.001
        legend.text = element_text(size = 12), 
        # legend.title = element_blank(),
        legend.title = element_text(size = 16),
        legend.position = 'right', 
        plot.margin=unit(x=c(top.mar,right.mar,bottom.mar,left.mar),
                         units="inches"))

# pallete 
Pale_51 <- as.vector(paletteer_d('ggsci::default_igv'))
pale_9_1 <- as.vector(paletteer_d('ggprism::pastels'))
pale_20_2 <- as.vector(paletteer_d('ggthemes::Tableau_20'))
pale_11_2 <- as.vector(paletteer_d('khroma::sunset'))
pale_12_2 <- as.vector(paletteer_d('RColorBrewer::Set3'))
pale_8_1 <- as.vector(paletteer_d('RColorBrewer::Pastel2'))
pale_29 <- c(pale_20_2, pale_9_1)
pale_28 <- c(pale_8_1, pale_11_2, pale_9_1)
pale_31 <- c(pale_8_1, pale_11_2, pale_12_2)
pale_32 <- c(pale_12_2, pale_20_2)
pale_25 <- c(pale_12_2, pale_11_2, '#85A4FE', 'grey')
pale_29 <- c(pale_25, pale_8_1)

pale_10 <- as.vector(paletteer_d('ggsci::default_jco'))
pale_51 <- as.vector(paletteer_d('ggsci::default_igv'))
cluster_pale <- c('#8DD3C7FF','#BEBADAFF',"#FB8072FF", "#FDB462FF")

basic_var(): obtain basic variables

Basic variables:

  • label_path,

  • output_path

  • graphdata.path

  • fig.path

  • kegg_anno.path

  • Sub_Swiss.path,

  • graphdata,

  • labels,

  • kegg_anno,

  • Sub_Swiss,

  • matched_swiss,

  • matched_kegg

basic_var <- function(dataset, group, SubDataset){
  # --- define path ---
  label_path <- paste("./data/", dataset, '/output_data/', SubDataset, '/generated_data/types.txt', sep = '')
  output_path <- paste("./data/", dataset, "/inputdata/", sep = '') 
  graphdata.path <- paste("./data/", dataset, "/inputdata/graphdata_", group, '.csv', sep = '')
  fig.path <- paste("./data/", dataset, "/output_data/Figures", sep = '') 
  graphdata.labeled.path <- paste("./data/", dataset, "/inputdata/graphdata_", group, 'labeled.csv', sep = '')
  # annotation
  kegg_anno.path <- paste("./data/", dataset, "/rawdata/kegg_annotation_all.csv", sep = '') 
  Sub_Swiss.path <- paste("./data/", dataset, "/inputdata/SubSwissExpre.csv", sep = '') 
  # --- load data ---
  graphdata <- read.csv(graphdata.path, header = TRUE)
  labels <- paste('Cluster',as.vector(apply(read.table(label_path, sep='\t', header = F)+1, 2, as.factor)))
  graphdata$Cluster <- labels
  
  # save labeled data
  if (file.exists(graphdata.labeled.path)){
    print(paste("File", graphdata.labeled.path, "exists."))
  } else {
    write.csv(graphdata, graphdata.labeled.path, row.names = FALSE)
  }
  
  # load kegg annotation
  if (file.exists(kegg_anno.path)){
      kegg_anno <- read.csv(kegg_anno.path)
      matched_kegg <- merge(graphdata[,-(length(graphdata)-2)], kegg_anno, by = 'id')
      Sub_Swiss = NA
      matched_swiss = NA
  } else {
        print('Skipped: lack of kegg annotation data.')
      }
  
  if (file.exists(Sub_Swiss.path)){
    Sub_Swiss <- read.csv(Sub_Swiss.path)
    matched_swiss <- merge(graphdata[-c((length(graphdata)-4):(length(graphdata)-1))], Sub_Swiss, by = 'id')
    matched_kegg <- merge(matched_swiss, kegg_anno, by = 'id')
  }
  
  # return var list 
  var_list <- list(label_path, output_path, graphdata.path, fig.path, kegg_anno.path, 
                   Sub_Swiss.path,
                   graphdata, labels, 
                   kegg_anno, Sub_Swiss,
                   matched_swiss, matched_kegg)
  return(var_list)
}

Clustering()

Clustering <- function(dataset, group, SubDataset){
  
  var_list <- basic_var(dataset, group, SubDataset)
  # # --- define path ---
  # label_path <- var_list[[1]]
  # output_path <- var_list[[2]]
  # graphdata.path <- var_list[[3]]
  # fig.path <- var_list[[4]]
    
  # --- load data ---
  graphdata <- var_list[[7]]
  labels <- var_list[[8]]
  
  # --- visualization ---
  library(tidyverse)
  library(cowplot)
  library(ggplot2)
  p <- ggplot(graphdata, aes(tSNE1,tSNE2, colour = Cluster)) +
    geom_point(size=2, alpha = 0.8) +
    # stat_ellipse(mapping=aes(group = Cluster,colour = Cluster), #分组边界圈
    #              geom = "path", # other: polygon
    #              linetype = 2,
    #              size=0.6,
    #              alpha=0.5) +
    xlab(NULL) +
    ylab(NULL) +
    theme_cowplot() +
    theme(
      panel.grid.major = element_blank(),
      panel.grid.minor = element_blank(),
      legend.title = element_blank(),
      panel.border = element_blank(),
      axis.line.x = element_line(color = "black", size = 0.5),
      axis.line.y = element_line(color = "black", size = 0.5),
      panel.background = element_blank()
    ) +
    scale_colour_manual(values = pale_25)
    
  fig.path <- paste("./data/", dataset, "/output_data/Figures", sep = '') 
  ggsave(filename = paste('ClusteringLabeled_', group,'.pdf', sep = ''), width = 6, height = 5, units = 'in', path = fig.path)
  return(p)
}

Annotation(): Extract Panels with interested annotation info

  • Problem/Flaw: genes that lack of annotation would be ignored
Annotation <- function(dataset, group, SubDataset, 
                       GeneSet_bykegg=0,
                       GeneSet_byswiss=0, 
                       GeneSet_bycluster=0){
  "
  return: only save data in output path
  "
  # load data
  var_list <- basic_var(dataset, group, SubDataset)
  matched_kegg <- var_list[[length(var_list)]]
  
  # extract and save gene set
  if (GeneSet_bycluster[[1]] != 0){
    # --- extract functional gene set by cluster ---
    # -- cluster 1 --
    for (i in 1:length(GeneSet_bycluster)){
      Cluster <- filter(matched_kegg, matched_kegg$Cluster == GeneSet_bycluster[[i]]) 
      ### save data
      print(paste('The total gene number of', GeneSet_bycluster[[i]], 'is:', length(unique(Cluster$id))) ) # 26 genes in total 
      output_path <- paste("./data/", dataset, "/output_data/R/", SubDataset, '_', GeneSet_bycluster[[i]], '.csv', sep = '') 
      write.csv(Cluster, output_path, row.names = FALSE)
    }
  } else if (GeneSet_byswiss[[1]] != 0){
    # --- extract functional gene set by gene name ---
    for (i in 1:length(GeneSet_byswiss)){
      for (j in 1:length(GeneSet_byswiss[[i]])){
        result.temp <- matched_kegg[grepl(GeneSet_byswiss[[i]][j], matched_kegg$SwissProt_Description), ] 
        if (j == 1){
          result <- result.temp
        } else{
          result <- rbind(result, result.temp)
        }
      }
      print(paste('The total gene number of', GeneSet_byswiss[[i]], 'is:', length(unique(result$id))) ) 
      output_path <- paste("./data/", dataset, "/output_data/R/", SubDataset, '_', GeneSet_byswiss[[i]][j], '.csv', sep = '') 
      write.csv(result, output_path, row.names = FALSE)
    }
  } else if (GeneSet_bykegg[[1]] != 0){
    # --- extract functional gene set by gene name ---
    for (i in 1:length(GeneSet_bykegg)){
      for (j in 1:length(GeneSet_bykegg[[i]])){
        result.temp <- matched_kegg[grepl(GeneSet_bykegg[[i]][j], matched_kegg$level3_pathway_name), ] 
        if (j == 1){
          result <- result.temp
        } else{
          result <- rbind(result, result.temp)
        }
      }
      print(paste('The total gene number of', GeneSet_bykegg[[i]], 'is:', length(unique(result$id))) ) # 26 genes in total 
      output_path <- paste("./data/", dataset, "/output_data/R/", SubDataset, '_', GeneSet_bykegg[[i]][j], '.csv', sep = '') 
      write.csv(result, output_path, row.names = FALSE)
    }
  }
}

obtain_pathdata()

# --- define path ---
obtain_pathdata <- function(dataset, group, SubDataset, 
                            Cluster){
  rpkm.path <- paste('./data/', dataset, "/rawdata/RPKM.txt", sep = '')
  Cluster.path <- paste('./data/', dataset, "/output_data/R/", SubDataset, '_', Cluster, '.csv', sep = '')
  group.path <- paste('./data/', dataset, "/inputdata/group.csv", sep = '')
  kegg_anno.path <- paste('./data/', dataset, "/rawdata/kegg_annotation_all.csv", sep = '') 
  Enrichment.path <- paste('./data/', dataset, '/output_data/R/', SubDataset, '_', Cluster, '_Enrichment.csv', sep = '')
  
  # --- load data ---
  Cluster.data <- read.csv(Cluster.path, header = T)
  rpkm <- read.table(rpkm.path, sep='\t', header = T)
  group <- read.csv(group.path)
  
  # --- combine data ---
  rawdata <- merge(rpkm, Cluster.data[,-c(2:length(rpkm))], by = 'id')
  samples <- colnames(rawdata)[2:length(rpkm)]
  print('The samples and groups are as follows:')
  print(samples)
  print(group)
  return (rawdata)
}

obtain_pvalue()

obtain_pvalue <- function(dataset, ref_group, group_sample.data, pathway, annotation.level){

  # prepare data
  group <- group_sample.data
  temp <- data.frame(t(pathway))
  colnames(temp) <- temp[1,]
  temp <- temp[-1,]
  name <- colnames(temp)
  
  library(ggpubr)
  temp <- data.frame(apply(temp[,1:(length(temp))], 2, as.numeric))
  temp$group <- group$Group
  
  # initial comparison
  compare_data <- temp[, c(1, length(temp))]
  colnames(compare_data) <- c('Pathway', 'group')
  result <- compare_means(Pathway~group, data = compare_data, ref.group = ref_group, 
                          method = 't.test', 
                          paired = FALSE)
  p_value <- data.frame(t(result$p))
  expre_group.temp <- unique(group$Group)
  loc <- match(ref_group, expre_group.temp)
  expre_group <- expre_group.temp[-loc]
  colnames(p_value) <- expre_group
  # loop
  for (i in 2:(length(temp)-1)){
    compare_data <- temp[, c(i, length(temp))]
    colnames(compare_data) <- c('Pathway', 'group')
    result <- compare_means(Pathway ~ group, data = compare_data, ref.group = ref_group, 
                            method = 't.test', 
                            paired = FALSE)
    p <- data.frame(t(result$p))
    colnames(p) <- expre_group
    p_value <- rbind(p_value, p)
  }
  
  rownames(p_value) <- name
  p_value$pathway <- rownames(p_value)
  
  colnames(p_value) <- c(paste('pvalue', colnames(p_value)[1:(length(p_value)-1)], sep = '_'), annotation.level)
  
  df.pathlevel_p <- merge(pathway, p_value, by = annotation.level)
  
  return(df.pathlevel_p)
}

ObtainStatistic()

ObtainStatistic <- function(pathway, df.pathlevel_p, group_sample.data){
  mean <- apply(pathway[,2:(length(pathway)-1)], 1, mean)
  mean_list <- list()
  sd_list <- list()
  for (i in 1:((length(pathway)-1)/3)){
    sd.temp <- as.data.frame(apply(pathway[,(i*3-1):(i*3+1)], 1, sd))
    mean.temp <- as.data.frame(apply(pathway[,(i*3-1):(i*3+1)], 1, mean))
    sd_list[i] <- sd.temp
    mean_list[i] <- mean.temp
  }
  
  for (i in 2:((length(pathway)-1)/3)){
    df.pathlevel_p[SubDataset] <- mean_list[[i]]/mean_list[[1]]
    df.pathlevel_p[unique(group_sample.data$Group)[i]] <- mean_list[[i]]
  }
  return(df.pathlevel_p)
}

last_chara(): for data processing

last_chara <- function(string, chara=';'){
  result <- sub(paste0(".*", chara), "", string)
  return (result)
}

enrich()

enrich <- function(df.pathlevel_p, SubDataset, dataset, Cluster,
                   p_thre, FC_up, FC_down, Expre,
                   group_sample.data){
  Path.filtered <- filter(df.pathlevel_p, (df.pathlevel_p[SubDataset] > FC_up | df.pathlevel_p[SubDataset] < FC_down) & 
                            df.pathlevel_p[colnames(df.pathlevel_p[length(df.pathlevel_p)-2])] < p_thre & 
                            df.pathlevel_p[colnames(df.pathlevel_p[length(df.pathlevel_p)])] > Expre) 
  
  Enrichment.path <- paste('./data/', dataset, '/output_data/R/', SubDataset, '_', Cluster, '_Enrichment.csv', sep = '')
  if (file.exists(Enrichment.path)){
  print(paste('File', Enrichment.path, 'Exist.'))
} else {
  write.csv(Path.filtered, Enrichment.path, row.names = FALSE)
}
  plot_data <- Path.filtered
  kegg_anno <- read.csv(paste("./data/", dataset, "/rawdata/kegg_annotation_all.csv", sep = '')) 
  
  # match to obtain kegglevel2 labels
  plot_data$KEGGBrite2 <- kegg_anno[match(plot_data$level3_pathway_name, kegg_anno$level3_pathway_name),]$level2_pathway_name
  
  # extract the last level (after ';')
  plot_data$kegglevel3 <- apply(plot_data['level3_pathway_name'], 2, last_chara)
  plot_data$kegglevel2 <- apply(plot_data['KEGGBrite2'], 2, last_chara)
  
  # obtain data for visualization
  start_index <- length(group_sample.data$Samples)+2
  data <- plot_data[, c(1, start_index:(start_index+5))]
  data$Expression <- apply(plot_data[,(start_index-3):(start_index-1)], 1, mean)
  
  colnames(data) <- c('level3_pathway_name', 'pvalue', 'FoldChange', 'NHQ_mean', 'KEGGBrite2', 'kegglevel3', 'kegglevel2', 'Expression')
  
  # visualization
  library(ggrepel)
  p <- ggplot(data, aes(x = pvalue, y = FoldChange, size = Expression, color = kegglevel2)) +
    geom_point(alpha=0.7)
  p <- p + geom_text_repel(data = data, 
                      aes(pvalue, y = FoldChange, label = kegglevel3),
                      size=4,color="grey20",
                      point.padding = 0.5,hjust = 0.1,
                      segment.color="grey20",
                      segment.size=0.6,
                      segment.alpha=0.8,
                      nudge_x=0,
                      nudge_y=0, 
                      max.overlaps = 10
                      ) + 
    labs(x = 'P-value', y = 'Fold change', title = fig.name)
  
  fig <- p + scale_size(range = c(2, 20), name="Expression") +
    scale_color_manual(values = pale_20_2) + #'#006C75', #36CFC8',
    mytheme1
  
  fig.path <- paste("./data/", dataset, "/output_data/Figures", sep = '') 
  ggsave(filename = paste('Enrichment_', SubDataset,'.pdf', sep = ''), width = 10, height = 7.5, units = 'in', path = fig.path)
  
  return(fig)
}

GPsEnrichCompare()

GPsEnrichCompare <- function(dataset, group, SubDataset, k=7){
  '
  :para k: the number of cluster
  '
  # labeling gene panels
  plotdata_cluster.list <- list()
  for (i in 1:k){
    Cluster <- paste('Cluster', i)
    cluster_path <- paste("./data/", dataset, "/output_data/R/", SubDataset, '_', Cluster, '_Enrichment.csv', sep = '') 
    plotdata_cluster.list[[i]] <- read.csv(cluster_path)
    plotdata_cluster.list[[i]]$Cluster <- Cluster
    if (i == 1){
      plotdata_cluster <- as.data.frame(plotdata_cluster.list[[1]])
    } else {
      plotdata_cluster <- rbind(plotdata_cluster, plotdata_cluster.list[[i]])
    }
  }

  # prepare plot data
  plotdata_cluster$Expression <- apply(plotdata_cluster[,7:9], 1, mean)
  plotdata <- plotdata_cluster[,-c(2:7)]
  colnames(plotdata) <- c('level3_pathway_name', 'pvalue', 'FoldChange', group, 'Cluster', 'Expression')
  
  # plot
  library(ggrepel)
  p <- ggplot(plotdata,  
              aes(x = pvalue, y = FoldChange, size = Expression, color = Cluster)) +
    geom_point(alpha=0.7)
  p <- p + geom_text_repel(data = plotdata, 
                           aes(pvalue, y = FoldChange,label = level3_pathway_name),
                           size=4,color="grey20",
                           point.padding = 0.5,hjust = 0.1,
                           segment.color="grey20",
                           segment.size=0.6,
                           segment.alpha=0.8,
                           nudge_x=0,
                           nudge_y=0, 
                           max.overlaps = 11
  ) + 
    labs(x = 'P-value', y = 'Fold change', title = group)

  p <- p + scale_size(range = c(1, 15), name="Expression") +
    scale_color_manual(values = pale_10) +
    mytheme1
  
  # save fig
  fig.path <- paste("./data/", dataset, "/output_data/Figures", sep = '') 
  ggsave(filename = paste('GPsErichComparison_', group,'.pdf', sep = ''), width = 9, height = 8, units = 'in', path = fig.path)
  
  return(p)
}

Search_GePa()

Search_GePa <- function(var_list, SearchGene=TRUE, SearchPathway=FALSE){
  matched_kegg <- var_list[[length(var_list)]]

  if (SearchGene == TRUE){
    annotation <- 'ko_des'
    # grep gene info list
    kodes <- matched_kegg[grepl(interested.gene, matched_kegg[annotation][,1]), ] 
    
    # extract cluster 
    gene.clu.distri <- table(kodes$Cluster)
    print(paste('Most of', interested.gene, 'related gene belong to:'))
    print(gene.clu.distri[grepl(max(gene.clu.distri), gene.clu.distri)])
    print(paste('The cluster distribution of', interested.gene, 'is:'))
    print(gene.clu.distri)
    
    # obtain cluster 
    temp <- as.data.frame(gene.clu.distri[grepl(max(gene.clu.distri), gene.clu.distri)])
    Cluster <- rownames(temp)
    # print(paste('Gene', interested.gene, 'belongs to', Cluster))
  } else if (SearchPathway == TRUE){
    annotation <- 'level3_pathway_name'
    # grep gene info list
    level3_path <- matched_kegg[grepl(interested.pathway, matched_kegg[annotation][,1]), ] 
    
    # extract cluster 
    path.clu.distri <- table(level3_path$Cluster)
    print(paste('Most of ', interested.gene, 'related gene belong to:'))
    print(path.clu.distri[grepl(max(path.clu.distri), path.clu.distri)])
    print(paste('The cluster distribution of', interested.pathway, 'is:'))
    print(path.clu.distri)
    
    # obtain cluster 
    temp <- as.data.frame(gene.clu.distri[grepl(max(gene.clu.distri), gene.clu.distri)])
    Cluster <- rownames(temp)
  }

  return(Cluster)
}

ObtainCorNet()

ObtainCorNet <- function(dataset, SubDataset, Cluster, group, group_sample.data, 
                         gene_name=TRUE, thre.p = 0.05, thre.r = 0.8){
  # obtain filtered pathway
  Filteredcluster.path <- paste("./data/", dataset, "/output_data/R/", SubDataset, '_', Cluster, '_Enrichment.csv', sep = '') 
  Filteredcluster <- read.csv(Filteredcluster.path)
  
  # obtain pathway list & matching & obtain gene list
  rawdata <- obtain_pathdata(dataset, group, SubDataset, Cluster)
  
  # obtain ko descrption as labels
  kegg_anno.path <- paste("./data/", dataset, "/rawdata/kegg_annotation_all.csv", sep = '') 
  kegg_anno <- read.csv(kegg_anno.path)
  path_list <- Filteredcluster$level3_pathway_name
  
  # initial
  path <- path_list[1]
  # for loop 
  gene_list <- rawdata[grepl(path, rawdata$level3_pathway_name), ] 
  for (path in path_list[2:length(path_list)]){
    list <- rawdata[grepl(path, rawdata$level3_pathway_name), ]
    gene_list <- rbind(gene_list, list)
  }
  
  Col.name <- c('id', 
                group_sample.data$Samples, 
                # "SignalP", "tmhmm" , 
                "SwissProt_ID", "SwissProt_Description",
                  "level1_pathway_id", "level1_pathway_name", "level2_pathway_id", "level2_pathway_name", "level3_pathway_id", "level3_pathway_name",
                  'ko', 'ko_name', 'ko_des', 'level3_pathway_name')
  CorCol.name <- c('id', group_sample.data$Samples, 'ko_name')
  Hub_group_raw <- unique(gene_list[Col.name])
  Hub_group <- unique(Hub_group_raw[CorCol.name])
  merge_data <- unique(Hub_group)
  
  if (gene_name == TRUE) {
    # v1: merge gene_id and gene description
    gene_names <- data.frame(paste(merge_data$ko_name, ' (', merge_data$id, ')', sep = ''))
    colnames(gene_names) <- 'label'
    corr_data <- cbind(gene_names, merge_data[,2:(length(merge_data)-1)])
    rownames(corr_data) <- corr_data$label
    data <- t(corr_data[, -1])
  } else {
    # v2: only gene id
    corr_data <- merge_data
    rownames(corr_data) <- corr_data$id
    data <- t(corr_data[, -c(1, length(corr_data))])
  }
  
  # calculate correlation matrix
  library(psych)
  occor = corr.test(data,use="pairwise",method="spearman",adjust="fdr",alpha=0.05)
  occor.r = occor$r # 取相关性矩阵R值
  occor.p = occor$p # 取相关性矩阵p值
  occor.r[occor.p>thre.p | abs(occor.r) < thre.r ] = 0
  occor.path <- paste("./data/", dataset, "/output_data/R/", SubDataset, '_', Cluster, '_p', thre.p , '_r', thre.r, '_CorNet.csv', sep = '')
  if (file.exists(occor.path)) {
    print(paste('File', occor.path, 'exists.'))
  } else {
    write.csv(occor.r, occor.path)
  }

  return(occor.r)
}

ObtainGraph()

ObtainGraph <- function(dataset, SubDataset, Cluster, group, group_sample.data, igraph, gene_name=TRUE, thre.p = 0.05, thre.r = 0.8){
  
  # === nodes ===
  E(igraph)$corr <- E(igraph)$weight
  E(igraph)$weight <- abs(E(igraph)$weight)
  
  # nodes number check
  print(paste('The total nodes number is:',length(V(igraph)$name))) #或 vcount(igraph)
  
  #节点度(Degree)
  V(igraph)$degree <- degree(igraph)
  # V(igraph)$degree
  
  # --- Visualization: Degree distribution ---
  if (ExploreDegree == TRUE){
    degree_dist <- degree.distribution(igraph)[-1]
    degree_num <- 1:max(V(igraph)$degree)
    par(mfrow = c(1, 2))
    hist(V(igraph)$degree, xlab = 'Degree', ylab = 'Frequency',
    main = 'Degree distribution')
    plot(degree_num, degree_dist, log = 'xy', 
         xlab = 'Log-degree',
         ylab = 'Log-intensity', 
         main = 'Log-log degree distribution')
  }
  
  # --- Exploratory Analysis: Relationship between node degrees and the average degree of its "neighbors" --- 
  if (ExploreNeibor == TRUE){
    neighbor_degree <- graph.knn(igraph, V(igraph))$knn
    plot(V(igraph)$degree, neighbor_degree, log = 'xy',
       xlab = 'Log degree', ylab = 'Log average neighbor degree')
  }
  
  #加权度(Weighted degree)
  V(igraph)$weight_degree <- strength(igraph)
  # V(igraph)$weight_degree
  #接近中心性(Closeness centrality)
  V(igraph)$closeness_centrality <- closeness(igraph)
  # V(igraph)$closeness_centrality
  #介数中心性(Betweenness centrality)
  V(igraph)$betweenness_centrality <- betweenness(igraph)
  # V(igraph)$betweenness_centrality
  #特征向量中心性(Eigenvector centrality)
  V(igraph)$eigenvector_centrality <- evcent(igraph)$vector
  # V(igraph)$eigenvector_centrality
  
  # --- Exploratory Analysis: The relationship between three features describing node centrality ---
  if (ExploreNeibor == TRUE){
    library(car)
    scatter3d(V(igraph)$closeness_centrality, V(igraph)$betweenness_centrality, V(igraph)$eigenvector_centrality, xlab = 'Closeness centrality', ylab = 'Betweenness centrality', zlab = 'Eigenvector centrality', surface = FALSE)
    # Relationship between node degree and node centrality, e.g. the relationship with feature vector centrality
    plot(V(igraph)$degree, V(igraph)$eigenvector_centrality,
    xlab = 'Degree', ylab = 'Eigenvector centrality')
  }
  
  # obtainn node list
  node_list <- data.frame(node_id = V(igraph)$name,
                          degree = V(igraph)$degree,
                          weight_degree = V(igraph)$weight_degree,
                          closeness_centrality = V(igraph)$closeness_centrality,
                          betweenness_centrality = V(igraph)$betweenness_centrality,
                          eigenvector_centrality = V(igraph)$eigenvector_centrality)
  
  
  # save node
  node.path <- paste("./data/", dataset, "/output_data/R/", SubDataset, '_', Cluster, '_p', thre.p , '_r', thre.r, '_Node.csv', sep = '')
  write.table(node_list, node.path, sep = 't', row.names = FALSE, quote = FALSE)
  
  # === Edges ===
  #权重(Weighted)
  E(igraph)$weight
  #边介数中心性(Edge betweenness centrality)
  E(igraph)$betweenness_centrality <- edge.betweenness(igraph)
  E(igraph)$betweenness_centrality
  
  # obtain edge list
  edge <- data.frame(as_edgelist(igraph)) #igraph 
  edge_list <- data.frame(
  source = edge[[1]],
  target = edge[[2]],
  weight = E(igraph)$weight,
  correlation = E(igraph)$corr,
  betweenness_centrality = E(igraph)$betweenness_centrality
  )
  
  # save edge
  edge.path <- paste("./data/", dataset, "/output_data/R/", SubDataset, '_', Cluster, '_p', thre.p , '_r', thre.r, '_Edges.csv', sep = '')
  write.table(edge_list, edge.path, sep = 't', row.names = FALSE, quote = FALSE)
  
  graph_list <- list(node_list, edge_list)
  return(graph_list)
}

Obtain_landmark()

obtain_landmark <- function(graph_list, dataset, group, SubDataset, 
                            Cluster, group_sample.data){
  # landmark genes: merge nodes list with expression level 
  node_list <- graph_list[[1]]
  edge_list <- graph_list[[2]]
  node_list$id <- node_list$node_id
  # var_list <- basic_var(dataset, group, SubDataset)
  # matched_kegg <- var_list[[length(var_list)]]
  
  rawdata <- obtain_pathdata(dataset, group, SubDataset, Cluster)
  
  CorCol.name <- c('id', group_sample.data$Samples, 'ko_name', 'ko_des')
  landmark_info <- unique(rawdata[CorCol.name])
  landmark <- merge(node_list, landmark_info, by = 'id')
  
  landmark$Mean <- apply(landmark[group_sample.data$Samples], 1, mean)
  landmark.ordered <- landmark[order(landmark$Mean, decreasing = TRUE),]
  return(landmark.ordered)
}

DEMO1: Yellow light induced optogenetic denitrifier

APP1: 2D-Visualization of gene panels (cluster)

# setting parameters
dataset = 'LY_9samples_metatranscriptomics'
group = 'Yellow_vs_Dark'
SubDataset <- 'YvsD_SubCell_dgi'

var_list <- basic_var(dataset, group, SubDataset)
## [1] "File ./data/LY_9samples_metatranscriptomics/inputdata/graphdata_Yellow_vs_Darklabeled.csv exists."
p <- Clustering(dataset, group, SubDataset)
## [1] "File ./data/LY_9samples_metatranscriptomics/inputdata/graphdata_Yellow_vs_Darklabeled.csv exists."
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.1     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## 
## Attaching package: 'cowplot'
## 
## 
## The following object is masked from 'package:lubridate':
## 
##     stamp
## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
p

# --- load functional genes ---
library(dplyr)

# setting parameters
dataset = 'LY_9samples_metatranscriptomics'
group = 'Yellow_vs_Dark'
SubDataset <- 'YvsD_SubCell_dgi'

var_list <- basic_var(dataset, group, SubDataset)
## [1] "File ./data/LY_9samples_metatranscriptomics/inputdata/graphdata_Yellow_vs_Darklabeled.csv exists."
GeneSet_bycluster <- list('Cluster 1', 'Cluster 2','Cluster 3', 'Cluster 4', 'Cluster 5', 'Cluster 6', 'Cluster 7')
GeneSet_byswiss <- list(c('nitrate', 'nitrite'))
GeneSet_bykegg <- list(c('Phototransduction'))

# Annotation(dataset, group, SubDataset, GeneSet_bykegg,
#            GeneSet_byswiss=0, 
#            GeneSet_bycluster=0)
Annotation(dataset, group, SubDataset, GeneSet_bykegg=0,
           GeneSet_byswiss=0, 
           GeneSet_bycluster)
## [1] "File ./data/LY_9samples_metatranscriptomics/inputdata/graphdata_Yellow_vs_Darklabeled.csv exists."
## [1] "The total gene number of Cluster 1 is: 512"
## [1] "The total gene number of Cluster 2 is: 353"
## [1] "The total gene number of Cluster 3 is: 26"
## [1] "The total gene number of Cluster 4 is: 120"
## [1] "The total gene number of Cluster 5 is: 783"
## [1] "The total gene number of Cluster 6 is: 80"
## [1] "The total gene number of Cluster 7 is: 131"

APP2: Panel enrichment analysis

# rm(list=ls()) 
# --- setting parameters ---
# *dataset
dataset = 'LY_9samples_metatranscriptomics'
group = 'Yellow_vs_Dark'
SubDataset <- 'YvsD_SubCell_dgi'
Cluster = 'Cluster 3' 

# *set basic data for p-value and other statistic parameters calculations
ref_group <- 'Dark'
group_sample <- 'group_Yellow'
group_sample.path <- paste('./data/', dataset, '/inputdata/', group_sample, '.csv', sep = '')
group_sample.data <- read.csv(group_sample.path)
annotation.level <- 'level3_pathway_name'
# === pvalue and statistic analysis ===
rawdata <- obtain_pathdata(dataset, group, SubDataset, 
                            Cluster)
## [1] "The samples and groups are as follows:"
## [1] "Blue1.x" "Blue2.x" "Blue3.x" "Dark1"   "Dark2"   "Dark3"   "Yellow1"
## [8] "Yellow2" "Yellow3"
##   Samples  Group
## 1   Dark1   Dark
## 2   Dark2   Dark
## 3   Dark3   Dark
## 4   Blue1   Blue
## 5   Blue2   Blue
## 6   Blue3   Blue
## 7 Yellow1 Yellow
## 8 Yellow2 Yellow
## 9 Yellow3 Yellow
# *setting for visualization app
fig.name <- paste(Cluster, ' of ', group_sample, sep = '')
  
# --- obtain expression level by pathway ---
library(dplyr)
pathway <- rawdata %>% 
  group_by(level3_pathway_name) %>%
  summarise(Dark1mean = mean(Dark1), 
            Dark2mean = mean(Dark2),
            Dark3mean = mean(Dark3),
            Yellow1mean = mean(Yellow1), 
            Yellow2mean = mean(Yellow2),
            Yellow3mean = mean(Yellow3)) 

# --- obtain p-value ---
df.pathlevel_p <- obtain_pvalue(dataset,ref_group, group_sample.data,  pathway, annotation.level)
## 
## Attaching package: 'ggpubr'
## The following object is masked from 'package:cowplot':
## 
##     get_legend
# --- obtain other statistic parameters ---
df.pathlevel_p <- ObtainStatistic(pathway, df.pathlevel_p, group_sample.data)

# *set threshod for filtering
p_thre <- 0.2
FC_up <- 2
FC_down <- 0.5
Expre <- 10

# --- conduct enrichment analysis ---
fig <- enrich(df.pathlevel_p, SubDataset, dataset, Cluster,
                   p_thre, FC_up, FC_down, Expre,
                   group_sample.data)
## Warning: Using one column matrices in `filter()` was deprecated in dplyr 1.1.0.
## ℹ Please use one dimensional logical vectors instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## [1] "File ./data/LY_9samples_metatranscriptomics/output_data/R/YvsD_SubCell_dgi_Cluster 3_Enrichment.csv Exist."
## Warning: ggrepel: 23 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
fig
## Warning: ggrepel: 29 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

APP3: Interested functions & genes analysis among gene panels

To conduct gene panels comparison, enrichment analysis of targeted panel is a prerequisite. Please make sure to have used APP2 for enrichment analysis with proper threshold.

  • In default, all the clusters are compared.

  • In this case, it can be observed clearly that cluster 3 is the hub gene panel with high expression level.

# setting parameters
dataset = 'LY_9samples_metatranscriptomics'
group = 'Yellow_vs_Dark'
SubDataset <- 'YvsD_SubCell_dgi'

p <- GPsEnrichCompare(dataset, group, SubDataset, k=7) 
## Warning: ggrepel: 130 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
p
## Warning: ggrepel: 136 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

APP4: Topology Network with landmark genes regulatory strategy

  • Obtain topological properties & landmark genes

From app3, we can identify the Hub Gene Panels (HGPs) and Signaling Gene Panels (HGPs).

To better regulate the microbiome, we construct regulatory network, also called interaction network, on the genes level based on microbial genetic topology.

The occor.r and graph_list can also be utilized for visualization in Gephi or R.

# setting parameters
dataset <- 'LY_9samples_metatranscriptomics'
group <- 'Yellow_vs_Dark'
SubDataset <- 'YvsD_SubCell_dgi'
# *set basic data for statistic parameters calculations
ref_group <- 'Dark'
group_sample <- 'group_Yellow'
group_sample.path <- paste('./data/', dataset, '/inputdata/', group_sample, '.csv', sep = '')
group_sample.data <- read.csv(group_sample.path)

# --- gene panel setting ---
# *setting for gene searching: Obtain the targeted genes and corresponding gene panel
# method 1: manually set interested gene panel (cluster)
Cluster <- 'Cluster 3' # HGP
# method 2: search the gene panel that interested gene or pathway assigned to 
interested.gene <- 'nitrite'
interested.pathway <- 'Phototransduction'
SearchGene <- TRUE
SearchPathway <- FALSE
'
option: level3_pathway_name, ko_des, SwissProt_Description
'
## [1] "\noption: level3_pathway_name, ko_des, SwissProt_Description\n"
Cluster <- Search_GePa(var_list, SearchGene=TRUE, SearchPathway=FALSE)
## [1] "Most of nitrite related gene belong to:"
## Cluster 4 
##         5 
## [1] "The cluster distribution of nitrite is:"
## 
## Cluster 2 Cluster 3 Cluster 4 Cluster 5 Cluster 6 
##         3         2         5         1         1
# === obtain correlation matrix: occor.r ===
occor.r <- ObtainCorNet(dataset, SubDataset, Cluster, group, group_sample.data, gene_name=TRUE, thre.p = 0.05, thre.r = 0.8)
## [1] "The samples and groups are as follows:"
## [1] "Blue1.x" "Blue2.x" "Blue3.x" "Dark1"   "Dark2"   "Dark3"   "Yellow1"
## [8] "Yellow2" "Yellow3"
##   Samples  Group
## 1   Dark1   Dark
## 2   Dark2   Dark
## 3   Dark3   Dark
## 4   Blue1   Blue
## 5   Blue2   Blue
## 6   Blue3   Blue
## 7 Yellow1 Yellow
## 8 Yellow2 Yellow
## 9 Yellow3 Yellow
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
## [1] "File ./data/LY_9samples_metatranscriptomics/output_data/R/YvsD_SubCell_dgi_Cluster 4_p0.05_r0.8_CorNet.csv exists."
library(igraph)
## 
## Attaching package: 'igraph'
## The following objects are masked from 'package:lubridate':
## 
##     %--%, union
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## The following objects are masked from 'package:purrr':
## 
##     compose, simplify
## The following object is masked from 'package:tidyr':
## 
##     crossing
## The following object is masked from 'package:tibble':
## 
##     as_data_frame
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
# parameter setting*
ExploreDegree = FALSE
ExploreNeibor = FALSE
adjacency_weight <- occor.r
igraph <- graph_from_adjacency_matrix(as.matrix(adjacency_weight), mode = 'undirected', weighted = TRUE, diag = FALSE)

# === obtain graph: graph_list ===
graph_list <- ObtainGraph(dataset, SubDataset, Cluster, group, group_sample.data, igraph, gene_name=TRUE, thre.p = 0.05, thre.r = 0.8)
## [1] "The total nodes number is: 22"
# --- obtain landmark genes ----
landmark.ordered <- obtain_landmark(graph_list, dataset, group, SubDataset, 
                            Cluster, group_sample.data)
## [1] "The samples and groups are as follows:"
## [1] "Blue1.x" "Blue2.x" "Blue3.x" "Dark1"   "Dark2"   "Dark3"   "Yellow1"
## [8] "Yellow2" "Yellow3"
##   Samples  Group
## 1   Dark1   Dark
## 2   Dark2   Dark
## 3   Dark3   Dark
## 4   Blue1   Blue
## 5   Blue2   Blue
## 6   Blue3   Blue
## 7 Yellow1 Yellow
## 8 Yellow2 Yellow
## 9 Yellow3 Yellow
landmark.top5 <- landmark.ordered$ko_name[1:5]

print(paste('The top 5 landmark genes of', Cluster, 'is:'))
## [1] "The top 5 landmark genes of Cluster 4 is:"
print(landmark.top5)
## [1] NA NA NA NA NA
  • Visualization
# 设置不同type的边颜色不同 
pale8 <- c("#8DD3C7FF","#FFFFB3FF","#BEBADAFF","#FB8072FF","#80B1D3FF","#FDB462FF","#B3DE69FF","#FCCDE5FF")
E(igraph)$color <- "#FB8072FF" 
index <- E(igraph)$type == "mention"
E(igraph)$color <- '#8DD3C7FF'

plot(igraph, 
     vertex.color = '#BEBADAFF', edge.color = E(igraph)$color,
     layout = layout_in_circle)

# cfg <- cluster_fast_greedy(igraph)
# plot(cfg, igraph)

DEMO2: Blue light induced optogenetic denitrifier

APP1: 2D-Visualization of gene panels (cluster)

# setting parameters
dataset = 'LY_9samples_metatranscriptomics'
group = 'Blue_vs_Dark'
SubDataset <- 'BvsD_SubCell_dgi'

var_list <- basic_var(dataset, group, SubDataset)
## [1] "File ./data/LY_9samples_metatranscriptomics/inputdata/graphdata_Blue_vs_Darklabeled.csv exists."
p <- Clustering(dataset, group, SubDataset)
## [1] "File ./data/LY_9samples_metatranscriptomics/inputdata/graphdata_Blue_vs_Darklabeled.csv exists."
p

# --- load functional genes ---
library(dplyr)

# setting parameters
dataset = 'LY_9samples_metatranscriptomics'
group = 'Blue_vs_Dark'
SubDataset <- 'BvsD_SubCell_dgi'

var_list <- basic_var(dataset, group, SubDataset)
## [1] "File ./data/LY_9samples_metatranscriptomics/inputdata/graphdata_Blue_vs_Darklabeled.csv exists."
GeneSet_bycluster <- list('Cluster 1', 'Cluster 2','Cluster 3', 'Cluster 4', 'Cluster 5', 'Cluster 6', 'Cluster 7')
GeneSet_byswiss <- list(c('nitrate', 'nitrite'))
GeneSet_bykegg <- list(c('Phototransduction'))

# Annotation(dataset, group, SubDataset, GeneSet_bykegg,
#            GeneSet_byswiss=0, 
#            GeneSet_bycluster=0)
Annotation(dataset, group, SubDataset, GeneSet_bykegg=0,
           GeneSet_byswiss=0, 
           GeneSet_bycluster) 
## [1] "File ./data/LY_9samples_metatranscriptomics/inputdata/graphdata_Blue_vs_Darklabeled.csv exists."
## [1] "The total gene number of Cluster 1 is: 1838"
## [1] "The total gene number of Cluster 2 is: 767"
## [1] "The total gene number of Cluster 3 is: 4180"
## [1] "The total gene number of Cluster 4 is: 394"
## [1] "The total gene number of Cluster 5 is: 2105"
## [1] "The total gene number of Cluster 6 is: 899"
## [1] "The total gene number of Cluster 7 is: 744"
# The output are all unique genes

Cluster 1 Cluster 2 Cluster 3 Cluster 4 Cluster 5 Cluster 6 Cluster 7 4828 1795 9533 1273 4198 2061 1589

APP2: Panel enrichment analysis

# rm(list=ls()) 
# --- setting parameters ---
# *dataset
dataset = 'LY_9samples_metatranscriptomics'
group = 'Blue_vs_Dark'
SubDataset <- 'BvsD_SubCell_dgi'
Cluster = 'Cluster 7' 

# *set basic data for p-value and other statistic parameters calculations
ref_group <- 'Dark'
group_sample <- 'group_Blue'
group_sample.path <- paste('./data/', dataset, '/inputdata/', group_sample, '.csv', sep = '')
group_sample.data <- read.csv(group_sample.path)
annotation.level <- 'level3_pathway_name'
# === pvalue and statistic analysis ===
rawdata <- obtain_pathdata(dataset, group, SubDataset, 
                            Cluster)
## [1] "The samples and groups are as follows:"
## [1] "Blue1"     "Blue2"     "Blue3"     "Dark1"     "Dark2"     "Dark3"    
## [7] "Yellow1.x" "Yellow2.x" "Yellow3.x"
##   Samples  Group
## 1   Dark1   Dark
## 2   Dark2   Dark
## 3   Dark3   Dark
## 4   Blue1   Blue
## 5   Blue2   Blue
## 6   Blue3   Blue
## 7 Yellow1 Yellow
## 8 Yellow2 Yellow
## 9 Yellow3 Yellow
# *setting for visualization app
fig.name <- paste(Cluster, ' of ', group_sample, sep = '')
  
# --- obtain expression level by pathway ---
library(dplyr)
pathway <- rawdata %>% 
  group_by(level3_pathway_name) %>%
  summarise(Dark1mean = mean(Dark1), 
            Dark2mean = mean(Dark2),
            Dark3mean = mean(Dark3),
            Blue1mean = mean(Blue1), 
            Blue2mean = mean(Blue2),
            Blue3mean = mean(Blue3)) 

# --- obtain p-value ---
df.pathlevel_p <- obtain_pvalue(dataset,ref_group, group_sample.data,  pathway, annotation.level)

# --- obtain other statistic parameters ---
df.pathlevel_p <- ObtainStatistic(pathway, df.pathlevel_p, group_sample.data)

# *set threshod for filtering
if (Cluster == 'Cluster 4'){
  p_thre <- 0.05
  FC_up <- 2
  FC_down <- 0.2
  Expre <- 20
} else if (Cluster == 'Cluster 5'){
  p_thre <- 0.001 
  FC_up <- 10
  FC_down <- 0.5
  Expre <- 10
} else if (Cluster == 'Cluster 7'){
  p_thre <- 0.01
  FC_up <- 2
  FC_down <- 0.5
  Expre <- 10
} else if (Cluster == 'Cluster 2'){
  p_thre <- 0.001
  FC_up <- 2
  FC_down <- 0.5
  Expre <- 10
} else if (Cluster == 'Cluster 3'){
  p_thre <- 0.01
  FC_up <- 2
  FC_down <- 0.5
  Expre <- 5.5
} else if (Cluster == 'Cluster 1'){
  p_thre <- 0.01
  FC_up <- 2
  FC_down <- 0.5
  Expre <- 1.5
} else if (Cluster == 'Cluster 6'){
  p_thre <- 0.01
  FC_up <- 2
  FC_down <- 0.5
  Expre <- 10
}


# --- conduct enrichment analysis ---
fig <- enrich(df.pathlevel_p, SubDataset, dataset, Cluster,
                   p_thre, FC_up, FC_down, Expre,
                   group_sample.data)
## [1] "File ./data/LY_9samples_metatranscriptomics/output_data/R/BvsD_SubCell_dgi_Cluster 7_Enrichment.csv Exist."
fig

APP3: Interested functions & genes analysis among gene panels

To conduct gene panels comparison, enrichment analysis of targeted panel is a prerequisite. Please make sure to have used APP2 for enrichment analysis with proper threshold.

  • In default, all the clusters are compared.

  • In this case, it can be observed clearly that cluster 3 is the hub gene panel with high expression level.

# setting parameters
dataset = 'LY_9samples_metatranscriptomics'
group = 'Blue_vs_Dark'
SubDataset <- 'BvsD_SubCell_dgi'

p <- GPsEnrichCompare(dataset, group, SubDataset, k=7) 
## Warning: ggrepel: 100 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
p
## Warning: ggrepel: 101 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

APP4: Topology Network with landmark genes regulatory strategy

# setting parameters
dataset = 'LY_9samples_metatranscriptomics'
group = 'Blue_vs_Dark'
SubDataset <- 'BvsD_SubCell_dgi'
# *set basic data for statistic parameters calculations
ref_group <- 'Dark'
group_sample <- 'group_Blue'
group_sample.path <- paste('./data/', dataset, '/inputdata/', group_sample, '.csv', sep = '')
group_sample.data <- read.csv(group_sample.path)

# --- gene panel setting ---
# *setting for gene searching: Obtain the targeted genes and corresponding gene panel
# method 1: manually set interested gene panel (cluster)
Cluster <- 'Cluster 3' # HGP
# method 2: search the gene panel that interested gene or pathway assigned to 
interested.gene <- 'nitrite'
interested.pathway <- 'Phototransduction'
SearchGene <- TRUE
SearchPathway <- FALSE
'
option: level3_pathway_name, ko_des, SwissProt_Description
'
## [1] "\noption: level3_pathway_name, ko_des, SwissProt_Description\n"
Cluster <- Search_GePa(var_list, SearchGene=TRUE, SearchPathway=FALSE)
## [1] "Most of nitrite related gene belong to:"
## Cluster 3 
##        31 
## [1] "The cluster distribution of nitrite is:"
## 
## Cluster 1 Cluster 2 Cluster 3 Cluster 4 Cluster 5 Cluster 6 Cluster 7 
##        16        12        31         9         6         5         9
# === obtain correlation matrix: occor.r ===
occor.r <- ObtainCorNet(dataset, SubDataset, Cluster, group, group_sample.data, gene_name=FALSE, thre.p = 0.05, thre.r = 0.8)
## [1] "The samples and groups are as follows:"
## [1] "Blue1"     "Blue2"     "Blue3"     "Dark1"     "Dark2"     "Dark3"    
## [7] "Yellow1.x" "Yellow2.x" "Yellow3.x"
##   Samples  Group
## 1   Dark1   Dark
## 2   Dark2   Dark
## 3   Dark3   Dark
## 4   Blue1   Blue
## 5   Blue2   Blue
## 6   Blue3   Blue
## 7 Yellow1 Yellow
## 8 Yellow2 Yellow
## 9 Yellow3 Yellow
## [1] "File ./data/LY_9samples_metatranscriptomics/output_data/R/BvsD_SubCell_dgi_Cluster 3_p0.05_r0.8_CorNet.csv exists."
library(igraph)
# parameter setting*
ExploreDegree = FALSE
ExploreNeibor = FALSE
adjacency_weight <- occor.r
igraph <- graph_from_adjacency_matrix(as.matrix(adjacency_weight), mode = 'undirected', weighted = TRUE, diag = FALSE)

# === obtain graph: graph_list ===
graph_list <- ObtainGraph(dataset, SubDataset, Cluster, group, group_sample.data, igraph, gene_name=FALSE, thre.p = 0.05, thre.r = 0.8) 
## [1] "The total nodes number is: 470"
# --- obtain landmark genes ----
landmark.ordered <- obtain_landmark(graph_list, dataset, group, SubDataset, 
                            Cluster, group_sample.data)
## [1] "The samples and groups are as follows:"
## [1] "Blue1"     "Blue2"     "Blue3"     "Dark1"     "Dark2"     "Dark3"    
## [7] "Yellow1.x" "Yellow2.x" "Yellow3.x"
##   Samples  Group
## 1   Dark1   Dark
## 2   Dark2   Dark
## 3   Dark3   Dark
## 4   Blue1   Blue
## 5   Blue2   Blue
## 6   Blue3   Blue
## 7 Yellow1 Yellow
## 8 Yellow2 Yellow
## 9 Yellow3 Yellow
landmark.top5 <- landmark.ordered$ko_name[1:5]

print(paste('The top 5 landmark genes of', Cluster, 'is:'))
## [1] "The top 5 landmark genes of Cluster 3 is:"
print(landmark.top5)
## [1] "fusA, GFM, EFG"  "folD"            "E3.1.11.2, xthA" "fusA, GFM, EFG" 
## [5] "trpB"
  • Visualization: network
# 设置不同type的边颜色不同 
pale8 <- c("#8DD3C7FF","#FFFFB3FF","#BEBADAFF","#FB8072FF","#80B1D3FF","#FDB462FF","#B3DE69FF","#FCCDE5FF")
E(igraph)$color <- "#FB8072FF" 
index <- E(igraph)$type == "mention"
E(igraph)$color <- '#8DD3C7FF'

plot(igraph, 
     vertex.color = '#BEBADAFF', edge.color = E(igraph)$color,
     layout = layout_in_circle)

cfg <- cluster_fast_greedy(igraph)
plot(cfg, igraph)

DEMO3: Yellow light induced denitrifier based on metagenomics

Graph Learning Results

  • The gene panels are obtained after 20000 epochs

    feature shape: (5, 57)

    cuda is not available

    Threshold: 4

    Links number: 541

    Average Links: 9.491228070175438

    Adj: (57, 57) Edges: 541

    X: (57, 5)

  • Result; SCI score (Clustering quality) is: 0.59939337

  • Tips: increase threshold will bring about more edges, that more information can be passed to the neighboring nodes, while the calculation time will also be longer and probably lead to over-fitting.

APP1: 2D-Visualization of gene panels (cluster)

# rm(list=ls()) 
# --- load functional genes ---
library(dplyr)

# setting parameters
dataset = 'LY_15samples_metagenomics'
group = 'Yellow_vs_Dark'
SubDataset <- 'Yellow_vs_Dark'

var_list <- basic_var(dataset, group, SubDataset)
## [1] "File ./data/LY_15samples_metagenomics/inputdata/graphdata_Yellow_vs_Darklabeled.csv exists."
p <- Clustering(dataset, group, SubDataset)
## [1] "File ./data/LY_15samples_metagenomics/inputdata/graphdata_Yellow_vs_Darklabeled.csv exists."
p

APP2

  • output:
[1] "The total gene number of Cluster 1 is: 7"
[1] "The total gene number of Cluster 2 is: 9"
[1] "The total gene number of Cluster 3 is: 3"
[1] "The total gene number of Cluster 4 is: 7"
[1] "The total gene number of Cluster 5 is: 15"
[1] "The total gene number of Cluster 6 is: 8"
[1] "The total gene number of Cluster 7 is: 8"
# rm(list=ls()) 
dataset = 'LY_15samples_metagenomics'
group = 'Yellow_vs_Dark'
SubDataset <- 'Yellow_vs_Dark'

var_list <- basic_var(dataset, group, SubDataset)
## [1] "File ./data/LY_15samples_metagenomics/inputdata/graphdata_Yellow_vs_Darklabeled.csv exists."
GeneSet_bycluster <- list('Cluster 1', 'Cluster 2', 'Cluster 3', 'Cluster 4', 'Cluster 5', 'Cluster 6', 'Cluster 7')
# GeneSet_byswiss <- list(c('nitrate', 'nitrite'))
# GeneSet_bykegg <- list(c('Phototransduction'))

# Annotation(dataset, group, SubDataset, GeneSet_bykegg,
#            GeneSet_byswiss=0, 
#            GeneSet_bycluster=0)
Annotation(dataset, group, SubDataset, GeneSet_bykegg=0,
           GeneSet_byswiss=0, 
           GeneSet_bycluster)
## [1] "File ./data/LY_15samples_metagenomics/inputdata/graphdata_Yellow_vs_Darklabeled.csv exists."
## [1] "The total gene number of Cluster 1 is: 7"
## [1] "The total gene number of Cluster 2 is: 9"
## [1] "The total gene number of Cluster 3 is: 3"
## [1] "The total gene number of Cluster 4 is: 7"
## [1] "The total gene number of Cluster 5 is: 15"
## [1] "The total gene number of Cluster 6 is: 8"
## [1] "The total gene number of Cluster 7 is: 8"
# rm(list=ls()) 
# --- setting parameters ---
# *dataset
dataset = 'LY_15samples_metagenomics'
group = 'Yellow_vs_Dark'
SubDataset <- 'Yellow_vs_Dark'
Cluster = 'Cluster 5' 

# *set basic data for p-value and other statistic parameters calculations
ref_group <- 'Dark'
group_sample <- 'group_Yellow'
group_sample.path <- paste('./data/', dataset, '/inputdata/', group_sample, '.csv', sep = '')
group_sample.data <- read.csv(group_sample.path)
annotation.level <- 'level3_pathway_name'
# === pvalue and statistic analysis ===
rawdata <- obtain_pathdata(dataset, group, SubDataset, 
                            Cluster)
## [1] "The samples and groups are as follows:"
##  [1] "G3A" "G2A" "R3A" "D2A" "Y1A" "Y2A" "B2A" "R1A" "D3A" "B1A" "Y3A" "B3A"
## [13] "D1A" "G1A" "R2A"
##    Samples  Group
## 1      D2A   Dark
## 2      D3A   Dark
## 3      D1A   Dark
## 4      B2A   Blue
## 5      B1A   Blue
## 6      B3A   Blue
## 7      G3A  Green
## 8      G2A  Green
## 9      G1A  Green
## 10     R3A    Red
## 11     R1A    Red
## 12     R2A    Red
## 13     Y1A Yellow
## 14     Y2A Yellow
## 15     Y3A Yellow
# *setting for visualization app
fig.name <- paste('HGP of ', group_sample, sep = '')
  
# --- obtain expression level by pathway ---
library(dplyr)
pathway <- rawdata %>% 
  group_by(level3_pathway_name) %>%
  summarise(Dark1mean = mean(D1A), 
            Dark2mean = mean(D2A),
            Dark3mean = mean(D3A),
            Yellow1mean = mean(Y1A), 
            Yellow2mean = mean(Y2A),
            Yellow3mean = mean(Y3A)) 

# --- obtain p-value ---
df.pathlevel_p <- obtain_pvalue(dataset,ref_group, group_sample.data,  pathway, annotation.level)

# --- obtain other statistic parameters ---
df.pathlevel_p <- ObtainStatistic(pathway, df.pathlevel_p, group_sample.data)

# *set threshod for filtering
p_thre <- 0.275
FC_up <- 1
FC_down <- 1
Expre <- 0

# --- conduct enrichment analysis ---
fig <- enrich(df.pathlevel_p, SubDataset, dataset, Cluster,
                   p_thre, FC_up, FC_down, Expre,
                   group_sample.data)
## [1] "File ./data/LY_15samples_metagenomics/output_data/R/Yellow_vs_Dark_Cluster 5_Enrichment.csv Exist."
fig

DEMO3: Blue light induced metagenomics

Graph learning

feature shape: (5, 88)

cuda is not available

Threshold: 2

Links number: 1040

Average Links: 11.818181818181818

Adj: (88, 88) Edges: 1040

X: (88, 5)

  • SCI score (Clustering quality) is: 0.60509825

APP1: 2D-Visualization of gene panels (cluster)

# --- load functional genes ---
library(dplyr)

# setting parameters
dataset = 'LY_15samples_metagenomics'
group = 'Blue_vs_Dark'
SubDataset <- 'Blue_vs_Dark'

var_list <- basic_var(dataset, group, SubDataset)
## [1] "File ./data/LY_15samples_metagenomics/inputdata/graphdata_Blue_vs_Darklabeled.csv exists."
p <- Clustering(dataset, group, SubDataset)
## [1] "File ./data/LY_15samples_metagenomics/inputdata/graphdata_Blue_vs_Darklabeled.csv exists."
p

[1] "The total gene number of Cluster 1 is: 19"
[1] "The total gene number of Cluster 2 is: 10"
[1] "The total gene number of Cluster 3 is: 7"
[1] "The total gene number of Cluster 4 is: 20"
[1] "The total gene number of Cluster 5 is: 8"
[1] "The total gene number of Cluster 6 is: 11"
[1] "The total gene number of Cluster 7 is: 13"
# setting parameters
dataset = 'LY_15samples_metagenomics'
group = 'Blue_vs_Dark'
SubDataset <- 'Blue_vs_Dark'

var_list <- basic_var(dataset, group, SubDataset)
## [1] "File ./data/LY_15samples_metagenomics/inputdata/graphdata_Blue_vs_Darklabeled.csv exists."
GeneSet_bycluster <- list('Cluster 1', 'Cluster 2', 'Cluster 3', 'Cluster 4', 'Cluster 5', 'Cluster 6', 'Cluster 7')
# GeneSet_byswiss <- list(c('nitrate', 'nitrite'))
# GeneSet_bykegg <- list(c('Phototransduction'))

# Annotation(dataset, group, SubDataset, GeneSet_bykegg,
#            GeneSet_byswiss=0, 
#            GeneSet_bycluster=0)
Annotation(dataset, group, SubDataset, GeneSet_bykegg=0,
           GeneSet_byswiss=0, 
           GeneSet_bycluster)
## [1] "File ./data/LY_15samples_metagenomics/inputdata/graphdata_Blue_vs_Darklabeled.csv exists."
## [1] "The total gene number of Cluster 1 is: 19"
## [1] "The total gene number of Cluster 2 is: 10"
## [1] "The total gene number of Cluster 3 is: 7"
## [1] "The total gene number of Cluster 4 is: 20"
## [1] "The total gene number of Cluster 5 is: 8"
## [1] "The total gene number of Cluster 6 is: 11"
## [1] "The total gene number of Cluster 7 is: 13"

APP2: Panel enrichment analysis

  • ‘group_blue.csv’ need to be manually created
# rm(list=ls()) 
# --- setting parameters ---
# *dataset
dataset = 'LY_15samples_metagenomics'
group = 'Blue_vs_Dark'
SubDataset <- 'Blue_vs_Dark'
Cluster = 'Cluster 4' # 410 genes

# *set basic data for p-value and other statistic parameters calculations
ref_group <- 'Dark'
group_sample <- 'group_Blue'
group_sample.path <- paste('./data/', dataset, '/inputdata/', group_sample, '.csv', sep = '')
group_sample.data <- read.csv(group_sample.path)
annotation.level <- 'level3_pathway_name'
# === pvalue and statistic analysis ===
rawdata <- obtain_pathdata(dataset, group, SubDataset, 
                            Cluster)
## [1] "The samples and groups are as follows:"
##  [1] "G3A" "G2A" "R3A" "D2A" "Y1A" "Y2A" "B2A" "R1A" "D3A" "B1A" "Y3A" "B3A"
## [13] "D1A" "G1A" "R2A"
##    Samples  Group
## 1      D2A   Dark
## 2      D3A   Dark
## 3      D1A   Dark
## 4      B2A   Blue
## 5      B1A   Blue
## 6      B3A   Blue
## 7      G3A  Green
## 8      G2A  Green
## 9      G1A  Green
## 10     R3A    Red
## 11     R1A    Red
## 12     R2A    Red
## 13     Y1A Yellow
## 14     Y2A Yellow
## 15     Y3A Yellow
# *setting for visualization app
fig.name <- paste('HGP of ', group_sample, sep = '')
  
# --- obtain expression level by pathway ---
library(dplyr)
pathway <- rawdata %>% 
  group_by(level3_pathway_name) %>%
  summarise(Dark1mean = mean(D1A), 
            Dark2mean = mean(D2A),
            Dark3mean = mean(D3A),
            Blue1mean = mean(B1A), 
            Blue2mean = mean(B2A),
            Blue3mean = mean(B3A)) 

# --- obtain p-value ---
df.pathlevel_p <- obtain_pvalue(dataset,ref_group, group_sample.data,  pathway, annotation.level)

# --- obtain other statistic parameters ---
df.pathlevel_p <- ObtainStatistic(pathway, df.pathlevel_p, group_sample.data)

# *set threshod for filtering
p_thre <- 0.275
FC_up <- 1
FC_down <- 1
Expre <- 0

# --- conduct enrichment analysis ---
fig <- enrich(df.pathlevel_p, SubDataset, dataset, Cluster,
                   p_thre, FC_up, FC_down, Expre,
                   group_sample.data)
## [1] "File ./data/LY_15samples_metagenomics/output_data/R/Blue_vs_Dark_Cluster 4_Enrichment.csv Exist."
fig

DEMO4: EES-NHQ

Graoh learning

dataset_name = ‘ZJ_12samplesEES_metagenomics’

group = ‘NHQ_vs_Control’

feature shape: (5, 964)

cuda is not available

Threshold: 1

Links number: 12492

Average Links: 12.95850622406639

Adj: (964, 964) Edges: 12492

X: (964, 5)

  • SCI score (Clustering quality) is: 0.5108492

APP1: 2D-Visualization of gene panels (cluster)

# rm(list=ls()) 
# setting parameters
dataset = 'ZJ_12samplesEES_metagenomics'
group = 'NHQ_vs_Control'
SubDataset <- 'NHQ_vs_Control'

var_list <- basic_var(dataset, group, SubDataset)
## [1] "File ./data/ZJ_12samplesEES_metagenomics/inputdata/graphdata_NHQ_vs_Controllabeled.csv exists."
p <- Clustering(dataset, group, SubDataset)
## [1] "File ./data/ZJ_12samplesEES_metagenomics/inputdata/graphdata_NHQ_vs_Controllabeled.csv exists."
p

  • Output

    [1] "The total gene number of Cluster 1 is: 213"
    [1] "The total gene number of Cluster 2 is: 64"
    [1] "The total gene number of Cluster 3 is: 58"
    [1] "The total gene number of Cluster 4 is: 410"
    [1] "The total gene number of Cluster 5 is: 8"
    [1] "The total gene number of Cluster 6 is: 3"
    [1] "The total gene number of Cluster 7 is: 15"
# setting parameters
dataset = 'ZJ_12samplesEES_metagenomics'
group = 'NHQ_vs_Control'
SubDataset <- 'NHQ_vs_Control'

var_list <- basic_var(dataset, group, SubDataset)
## [1] "File ./data/ZJ_12samplesEES_metagenomics/inputdata/graphdata_NHQ_vs_Controllabeled.csv exists."
GeneSet_bycluster <- list('Cluster 1', 'Cluster 2', 'Cluster 3', 'Cluster 4', 'Cluster 5', 'Cluster 6', 'Cluster 7')
# GeneSet_byswiss <- list(c('nitrate', 'nitrite'))
# GeneSet_bykegg <- list(c('Phototransduction'))

# Annotation(dataset, group, SubDataset, GeneSet_bykegg,
#            GeneSet_byswiss=0, 
#            GeneSet_bycluster=0)
Annotation(dataset, group, SubDataset, GeneSet_bykegg=0,
           GeneSet_byswiss=0, 
           GeneSet_bycluster)
## [1] "File ./data/ZJ_12samplesEES_metagenomics/inputdata/graphdata_NHQ_vs_Controllabeled.csv exists."
## [1] "The total gene number of Cluster 1 is: 213"
## [1] "The total gene number of Cluster 2 is: 64"
## [1] "The total gene number of Cluster 3 is: 58"
## [1] "The total gene number of Cluster 4 is: 410"
## [1] "The total gene number of Cluster 5 is: 8"
## [1] "The total gene number of Cluster 6 is: 3"
## [1] "The total gene number of Cluster 7 is: 15"

APP2: Panel enrichment analysis

  • Obtain Pathway

  • group_by, summarise: name need to be set based on the samples’ name

# rm(list=ls()) 
# --- setting parameters ---
# *dataset
dataset <- 'ZJ_12samplesEES_metagenomics'
group = 'NHQ_vs_Control'
SubDataset <- 'NHQ_vs_Control'
Cluster = 'Cluster 4' # 410 genes

# *set basic data for p-value and other statistic parameters calculations
ref_group <- 'Control'
group_sample <- 'group_NHQ'
group_sample.path <- paste('./data/', dataset, '/inputdata/', group_sample, '.csv', sep = '')
group_sample.data <- read.csv(group_sample.path)
annotation.level <- 'level3_pathway_name'
# === pvalue and statistic analysis ===
rawdata <- obtain_pathdata(dataset, group, SubDataset, 
                            Cluster)
## [1] "The samples and groups are as follows:"
##  [1] "z9_1"  "z10_3" "z10_1" "z9_2"  "z11_2" "z11_3" "z10_2" "z11_1" "z1_3" 
## [10] "z9_3"  "z1_1"  "z1_2" 
##    Samples   Group
## 1     z1_3 Control
## 2     z1_1 Control
## 3     z1_2 Control
## 4    z11_2      MV
## 5    z11_3      MV
## 6    z11_1      MV
## 7    z10_3     NHQ
## 8    z10_1     NHQ
## 9    z10_2     NHQ
## 10    z9_1      NR
## 11    z9_2      NR
## 12    z9_3      NR
# *setting for visualization app
fig.name <- paste('HGP of ', group_sample, sep = '')
  
# --- obtain expression level by pathway ---
library(dplyr)
pathway <- rawdata %>% 
  group_by(level3_pathway_name) %>%
  summarise(Control1mean = mean(z1_1), 
            Control2mean = mean(z1_2),
            Control3mean = mean(z1_3),
            NHQ1mean = mean(z10_1), 
            NHQ2mean = mean(z10_2),
            NHQ3mean = mean(z10_3)) 

# --- obtain p-value ---
df.pathlevel_p <- obtain_pvalue(dataset,ref_group, group_sample.data,  pathway, annotation.level)

# --- obtain other statistic parameters ---
df.pathlevel_p <- ObtainStatistic(pathway, df.pathlevel_p, group_sample.data)

# *set threshod for filtering
p_thre <- 0.275
FC_up <- 1
FC_down <- 1
Expre <- 1

# --- conduct enrichment analysis ---
fig <- enrich(df.pathlevel_p, SubDataset, dataset, Cluster,
                   p_thre, FC_up, FC_down, Expre,
                   group_sample.data)
## [1] "File ./data/ZJ_12samplesEES_metagenomics/output_data/R/NHQ_vs_Control_Cluster 4_Enrichment.csv Exist."
fig