Set Working Directory :

# Project directory that holds the helper scripts, module scripts and data.
# NOTE(review): this is a machine-specific absolute path; adjust per machine.
project_dir <- "/Users/Vartika_Bisht/Individual_Project"
setwd(project_dir)
# knitr restores the working directory at the end of every chunk, so a bare
# setwd() would only affect this chunk. Registering the directory as knitr's
# root.dir makes the following chunks resolve their relative paths against it.
if (requireNamespace("knitr", quietly = TRUE)) {
  knitr::opts_knit$set(root.dir = project_dir)
}

Load all source codes :

# Helper scripts needed by the modules, loaded in this exact order.
pipeline_scripts <- c(
  "Penalty_Function.R",
  "Incorporate_Groups.R",
  "Libraries_Needed.R",
  "Borrowed_Functions.R",
  "MicrobiomeAnalyst.R",
  "best_epsilon_DBSCAN.R"
)
for (script_path in pipeline_scripts) {
  source(script_path)
}

Load Data Set :

# Load Dataset 1 (first sheet of the labelled workbook)
Data_Set_1 <- read.xlsx(
  "NFnetFU_Dataset1_wih_labels.xlsx",
  sheetIndex = 1
)

# Choose Microbiome Data: columns 3 to 49 of the sheet
feature_columns <- 3:49
df_data1 <- as.data.frame(Data_Set_1[, feature_columns])
# Numeric matrix version of the same columns
data1 <- data.matrix(df_data1)

# Create labels for prediction ( 2nd column ): the Class column is turned
# into a factor and its level codes are used as numeric labels
class_factor <- factor(Data_Set_1$Class)
label_dat <- as.numeric(class_factor)

Input Dataset :

head(df_data1)

Input Data for Module 1 (Features) :

head(as.data.frame(data1))

Input Data for Module 1 (Output Variable: Given Variable) :

#Labels
Data_Set_1$Class
 [1] "HC"     "HC"     "HC"     "HC"     "HC"     "HC"     "HC"     "HC"     "HC"     "HC"     "PSC-UC"
[12] "PSC-UC" "PSC-UC" "PSC-UC" "PSC-UC" "PSC-UC" "PSC-UC" "PSC-UC" "PSC-UC" "PSC-UC" "UC"     "UC"    
[23] "UC"     "UC"     "UC"     "UC"     "UC"     "UC"     "UC"     "UC"    

Input Data for Module 1 (Output Variable: Desired Labels) :

#Desired Labels
label_dat
 [1] 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3

Module 1:

## Input : Numeric Labels(label_dat) and Microbiome Abundance Data(data1)
source("Module_1.R")

  |                                                                                                   
  |                                                                                             |   0%
  |                                                                                                   
  |=========                                                                                    |  10%
  |                                                                                                   
  |===================                                                                          |  20%
  |                                                                                                   
  |============================                                                                 |  30%
  |                                                                                                   
  |=====================================                                                        |  40%
  |                                                                                                   
  |==============================================                                               |  50%
  |                                                                                                   
  |========================================================                                     |  60%
  |                                                                                                   
  |=================================================================                            |  70%
  |                                                                                                   
  |==========================================================================                   |  80%
  |                                                                                                   
  |====================================================================================         |  90%
  |                                                                                                   
  |=============================================================================================| 100%
[1] "ANFIS DONE!"
[1] "New labels have been assigned!"
[1] "Rule based matrix is saved!"
[1] "Scaled Ruled Based Matrix saved"
## Output : Rule Based Matrix (rules_int) , Scaled Rule Based Matrix (scaled_rules_int) and Labels (label_dat)

Module 1 Output (Rule Based Matrix):

head(as.data.frame(rules_int))

Module 1 Output (Labels):

as.data.frame(label_dat)[,1]
 [1] 1 3 1 2 3 3 1 3 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3

Module 2:

## Input : Scaled Rule Based Matrix (scaled_rules_int)
source("Module_2.R")
[1] "Epsilon value used : 3.5"
[1] "13 cluster(s) found!"
[1] "Clustering Done!"
[1] "Feature's cluster number saved"
[1] "Grouping Highly Colinear Features Together :-"
[1] "Clubbing features in a group together"
[1] "Features Clubbed and incorporated in a new Data Frame!"
[1] "Rule Based matrix with Colinearity Handled saved"
[1] "PCA Loadings used to combine groups saved"
## Output : Rule Based matrix with Colinearity Handled (new_data1) and PCA Loadings used to combine groups (PCA_loadings) 

Module 2 Output (Clusters):

groups_we_need
[[1]]
[1] "c__Gammaproteobacteria" "f__Enterobacteriaceae"  "o__Enterobacteriales"  

[[2]]
[1] "f__Myxococcales.0319.6G20" "s__Myxococcales.sp"       

[[3]]
[1] "c__Lentisphaeria" "o__Victivallales" "p__Lentisphaerae"

[[4]]
[1] "g__Streptococcus"     "f__Brevibacteriaceae" "s__Brevibacterium.sp" "c__Bacilli"          

[[5]]
[1] "c__Coriobacteriia"    "o__Coriobacteriales"  "f__Coriobacteriaceae"

[[6]]
[1] "g__Paraprevotella"   "g__Paraprevotella.1"

[[7]]
[1] "g__Erysipelotrichaceae.cc_115"    "s__Erysipelotrichaceae.cc_115.sp"

[[8]]
[1] "s__Desulfovibrionaceae.sp"           "g__Desulfovibrionaceae.unidentified"

[[9]]
[1] "s__Sphingomonas.sp" "g__Actinomyces"     "s__Actinomyces"     "s__Roseburia.sp"   

[[10]]
[1] "s__Tepidimonas.sp" "g__Tepidimonas"   

[[11]]
[1] "g__Rothia"        "s__mucilaginosa."

[[12]]
[1] "g__Shewanella"     "f__Shewanellaceae"

[[13]]
[1] "s__Parvimonas.sp" "g__Parvimonas"   

Module 2 Output (PCA Loadings):

head(as.data.frame(t(PCA_loadings)))

Module 2 Output (New Data Frame):

head(as.data.frame(new_data1))

Module 3:

## Input : Rule Based matrix with Colinearity Handled (new_data1) and PCA Loadings used to combine groups (PCA_loadings) 
source("Module_3.R")
[1] "Feature scores computed and saved"
## Output : Feature Parameters (feature_score)

Module 3 Output (Adaptive LASSO Results):

head(as.data.frame(feature_scores))

Module 4 (TSEA - Specify Disease):

#Diseases to look for in TSEA
disease <- c("Colorectal","Crohn","Colon")

Module 4 (TSEA Type of feature):

TSEA_feature <- "Microbes"

Module 4 (TSEA - OTU): If Features are OTU and need to be changed into appropriate Microbes for TSEA

if (TSEA_feature == "OTU") {
  # List of microbes from selected features (OTU): map OTU ids to taxa.
  # NOTE(review): the taxonomy path is machine-specific, and the file has a
  # .csv extension but is read with read.table()'s default whitespace
  # separator -- confirm both against the actual file.
  OTU_file <- read.table("/Users/Vartika_Bisht/Documents/GitHub/Microbiome_Data_Analysis/Data/NFnetFU_Dataset3_taxanomy.csv", header = TRUE)
  OTU_index <- which(OTU_file$OTU %in% rownames(feature_scores))
  selected_OTU <- OTU_file[OTU_index, ]
  feature_inorder <- selected_OTU$OTU
  write.csv(selected_OTU, "OTU Microbes Selected Table.csv")

  # Valid microbe names.
  # Each taxonomy string is a ";"-separated list of ranks, each followed by a
  # confidence in parentheses, e.g. "unclassified(100)". The name used for
  # TSEA is the deepest rank that is not "unclassified".
  deepest_classified_rank <- function(ranks) {
    if (length(ranks) == 0 || is.na(ranks[1]) ||
        ranks[1] == "unclassified(100)") {
      return("unclassified")
    }
    # Strip the trailing "(<confidence>)" whatever its width; cutting a fixed
    # five characters would damage names whose confidence is not "(100)".
    rank_names <- sub("\\([0-9.]+\\)$", "", ranks)
    classified <- rank_names[nzchar(rank_names) & rank_names != "unclassified"]
    # Always return exactly one name per OTU so the result stays aligned with
    # feature_inorder, even when every rank is unclassified.
    if (length(classified) == 0) {
      return("unclassified")
    }
    classified[length(classified)]
  }

  taxa <- strsplit(as.character(selected_OTU$Taxonomy), ";")
  OTU_network <- vapply(
    taxa,
    deepest_classified_rank,
    character(1),
    USE.NAMES = FALSE
  )

  # One row per selected OTU: row name = OTU id, value = name used for TSEA.
  Name_Change <- as.data.frame(OTU_network)
  rownames(Name_Change) <- feature_inorder
  write.csv(Name_Change, "Features to Microbes for TSEA.csv")
}

Module 4 (TSEA - Microbes of Different Taxa Level): If Features are Microbes of Different Taxa Level and need to be changed into appropriate Microbes for TSEA

if (TSEA_feature == "Microbes") {
  # List of microbes from selected features (Microbes).
  # The features are the column names of the rule based matrix; the first
  # three characters are dropped (presumably a rank prefix such as "g__" --
  # confirm against the data).
  feature_inorder <- colnames(rules_int)
  Microbes_name <- substring(feature_inorder, 4)
  #Microbes_name <- colnames(rules_int)

  # First tokens that do not name a taxon on their own.
  placeholder_heads <- c("uncultured", "X")

  # Pick the token used as the TSEA name from one split feature name.
  pick_tsea_name <- function(tokens) {
    if (length(tokens) > 1 && tokens[1] %in% placeholder_heads) {
      # After a placeholder, a bare "1"/"2" is not a name either.
      if (tokens[2] %in% c("1", "2")) "uncultured" else tokens[2]
    } else {
      tokens[1]
    }
  }

  # Split every name on runs of the separator characters and keep one token
  # per feature. vapply() returns a character vector of the same length as
  # the input (including length zero) without growing it inside a loop.
  OTU_network <- vapply(
    strsplit(Microbes_name, split = "[\\|.,+]+"),
    pick_tsea_name,
    character(1),
    USE.NAMES = FALSE
  )

  # One row per feature: row name = original feature, value = TSEA name.
  Name_Change <- as.data.frame(OTU_network)
  rownames(Name_Change) <- feature_inorder
  write.csv(Name_Change, "Features to Microbes for TSEA.csv")
}

Module 4 (TSEA - The names used for TSEA with the feature associated):

head(as.data.frame(Name_Change))

Module 4 (TSEA - Unique microbe names submitted to TSEA):

# Distinct TSEA names as plain character strings, then display them.
Microbes <- as.character(unique(Name_Change$OTU_network))
Microbes
 [1] "Gammaproteobacteria" "Enterobacteriaceae"  "Enterobacteriales"   "Prevotellaceae"     
 [5] "Paraprevotellaceae"  "Myxococcales"        "Lentisphaeria"       "Victivallales"      
 [9] "Lentisphaerae"       "Streptococcus"       "Coriobacteriia"      "Coriobacteriales"   
[13] "Coriobacteriaceae"   "Paraprevotella"      "Blautia"             "Erysipelotrichaceae"
[17] "symbiosum"           "Desulfovibrionaceae" "Brevibacteriaceae"   "Brevibacterium"     
[21] "ramosum"             "Sphingomonas"        "Tepidimonas"         "Microbacteriaceae"  
[25] "hathewayi"           "Staphylococcus"      "adolescentis"        "Rothia"             
[29] "citroniae"           "mucilaginosa"        "distasonis"          "Actinomyces"        
[33] "stutzeri"            "Alteromonadales"     "Shewanella"          "Shewanellaceae"     
[37] "Bacilli"             "Roseburia"           "Parvimonas"          "fragilis"           

Module 4 (TSEA - Build the TSEA network from the list of microbes):

## Input : List of Microbes
source("Module_4(TSEA Network).R")
[1] "----Microbiome Analyst----"
[1] "Init MicrobiomeAnalyst!"
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:--  0:00:01 --:--:--     0
  0     0    0     0    0     0      0      0 --:--:--  0:00:02 --:--:--     0
 43  217k   43 97938    0     0  32386      0  0:00:06  0:00:03  0:00:03 32376
100  217k  100  217k    0     0  69256      0  0:00:03  0:00:03 --:--:-- 69234
[1] "Loaded files from MetaboAnalyst web-server."

 [11%] Downloaded 16018 bytes...
 [23%] Downloaded 32402 bytes...
 [35%] Downloaded 48786 bytes...
 [47%] Downloaded 65170 bytes...
 [59%] Downloaded 81554 bytes...
 [71%] Downloaded 97938 bytes...
 [83%] Downloaded 114322 bytes...
 [95%] Downloaded 130706 bytes...
 [100%] Downloaded 136784 bytes...
[1] "Loaded files from MetaboAnalyst web-server."
[1] "No matches were found in the selected taxon set library!"
[1] "Mixed-level taxa set was selected!"
[1] "Mix Taxa TSEA Results Calculated"
[1] "Mix Taxa TSEA Disease Specific Results Calculated"
[1] "Calculating Adjacency Matrix for Network"
## Output : Network and Network Legends with Node size (Legends)

Module 4 (TSEA Network):

plot(g, layout=layout_in_circle, vertex.size=vertex_wt,edge.width = E(g)$weight)

Module 4 (TSEA Network Legends):

as.data.frame(Network_Info)

Module 4 (Infusing Data Driven Information): TSEA Network and Adaptive LASSO Results

## Input : TSEA Network and Adaptive LASSO Results
# Features whose TSEA name equals each network node, in the node order of
# Network_Info (one character vector per node).
network_microbes <- as.character(Network_Info[, "Microbe Names"])
Cluster_OTU_name <- lapply(network_microbes, function(microbe) {
  as.character(feature_inorder[which(OTU_network %in% microbe)])
})

# Node score = mean absolute feature score over the features behind a node.
# A node with no matching feature gets NA so that the score vector always has
# one entry per node and stays aligned with Network_Info in cbind().
# NOTE(review): assumes feature_scores holds one score per feature row.
Cluster_Parameters <- vapply(Cluster_OTU_name, function(features) {
  if (length(features) == 0) {
    return(NA_real_)
  }
  scores <- unlist(
    lapply(features, function(feature) abs(feature_scores[feature, ])),
    use.names = FALSE
  )
  sum(scores) / length(scores)
}, numeric(1))

Data_Bio_Driven <- cbind(Network_Info, Cluster_Parameters)
rownames(Data_Bio_Driven) <- NULL
# The score is the column just appended; address it by position from the end
# instead of assuming Network_Info has exactly three columns.
colnames(Data_Bio_Driven)[ncol(Data_Bio_Driven)] <- "Node Score"
write.csv(Data_Bio_Driven, "Biological Network with Data Driven Results fused.csv")
## Output : Data Driven Cluster Parameters added

Module 4 (Infusing Data Driven Information): TSEA Network and Adaptive LASSO Results

as.data.frame(Data_Bio_Driven)

Module 4 (Infusing Data Driven Information): Module 2 Clusters and TSEA Network

## Input : Module 2 Clusters and TSEA Network
source("Module_4(Data Driven Network).R")
[1] "Calculating Edges and Nodes to be added for the Data Driven Network"
[1] "Final Fused Network Saved!"
[1] "Final Fused Network Cluster Information Saved!"
## Output : Network with Data Driven Clusters

Module 4 (Infusing Data Driven Information): Module 2 Clusters and TSEA Network

#Green Edges <- TSEA
#Red Edges <- Data Driven Cluslters
plot(gh, layout=layout_in_circle, vertex.size=vertex_wt_gh,edge.width = E(gh)$weight,edge.color=col_edges)

Module 4 (Infusing Data Driven Information): Module 2 Clusters and TSEA Network

as.data.frame(Data_Bio_Driven_with_clusters)
---
title: "NFnetFU Manual ( Dataset 1 )"
output:
  html_notebook: default
  html_document:
    df_print: paged
  pdf_document: default
---

Set Working Directory :
```{r}
# Project directory that holds the helper scripts, module scripts and data.
# NOTE(review): this is a machine-specific absolute path; adjust per machine.
project_dir <- "/Users/Vartika_Bisht/Individual_Project"
setwd(project_dir)
# knitr restores the working directory at the end of every chunk, so a bare
# setwd() would only affect this chunk. Registering the directory as knitr's
# root.dir makes the following chunks resolve their relative paths against it.
if (requireNamespace("knitr", quietly = TRUE)) {
  knitr::opts_knit$set(root.dir = project_dir)
}
```

Load all source codes :
```{r}
# Helper scripts needed by the modules, loaded in this exact order.
pipeline_scripts <- c(
  "Penalty_Function.R",
  "Incorporate_Groups.R",
  "Libraries_Needed.R",
  "Borrowed_Functions.R",
  "MicrobiomeAnalyst.R",
  "best_epsilon_DBSCAN.R"
)
for (script_path in pipeline_scripts) {
  source(script_path)
}
```

Load Data Set :
```{r}
# Load Dataset 1 (first sheet of the labelled workbook)
Data_Set_1 <- read.xlsx(
  "NFnetFU_Dataset1_wih_labels.xlsx",
  sheetIndex = 1
)

# Choose Microbiome Data: columns 3 to 49 of the sheet
feature_columns <- 3:49
df_data1 <- as.data.frame(Data_Set_1[, feature_columns])
# Numeric matrix version of the same columns
data1 <- data.matrix(df_data1)

# Create labels for prediction ( 2nd column ): the Class column is turned
# into a factor and its level codes are used as numeric labels
class_factor <- factor(Data_Set_1$Class)
label_dat <- as.numeric(class_factor)

```

Input Dataset :
```{r}
# Preview the first rows of the selected microbiome feature columns.
head(df_data1)
```

Input Data for Module 1 (Features) :
```{r}
# Preview the numeric feature matrix (data1), shown as a data frame.
head(as.data.frame(data1))
```

Input Data for Module 1 (Output Variable: Given Variable) :
```{r}
# Labels: the class names as stored in the Class column of the dataset.
Data_Set_1$Class
```

Input Data for Module 1 (Output Variable: Desired Labels) :
```{r}
# Desired labels: the Class column encoded as numeric factor level codes.
label_dat
```

Module 1:
```{r}
## Input : Numeric Labels (label_dat) and Microbiome Abundance Data (data1)
# The module is sourced, not called with arguments; presumably it reads the
# inputs above from the workspace -- verify in Module_1.R.
source("Module_1.R")
## Output : Rule Based Matrix (rules_int), Scaled Rule Based Matrix (scaled_rules_int) and Labels (label_dat)
```

Module 1 Output (Rule Based Matrix):
```{r}
# Preview the rule based matrix produced by Module 1.
head(as.data.frame(rules_int))
```

Module 1 Output (Labels):
```{r}
# Show label_dat as it stands after Module 1 (listed among the module outputs).
as.data.frame(label_dat)[,1]
```

Module 2:
```{r}
## Input : Scaled Rule Based Matrix (scaled_rules_int)
# The module is sourced, not called with arguments; presumably it reads the
# input above from the workspace -- verify in Module_2.R.
source("Module_2.R")
## Output : Rule Based matrix with Collinearity Handled (new_data1) and PCA Loadings used to combine groups (PCA_loadings)
```

Module 2 Output (Clusters):
```{r}
# Show the feature groups (clusters) left in the workspace by Module 2.
groups_we_need
```

Module 2 Output (PCA Loadings):
```{r}
# Preview the PCA loadings, transposed for display.
head(as.data.frame(t(PCA_loadings)))
```

Module 2 Output (New Data Frame):
```{r}
# Preview the new data frame produced by Module 2 (new_data1).
head(as.data.frame(new_data1))
```

Module 3:
```{r}
## Input : Rule Based matrix with Collinearity Handled (new_data1) and PCA Loadings used to combine groups (PCA_loadings)
# The module is sourced, not called with arguments; presumably it reads the
# inputs above from the workspace -- verify in Module_3.R.
source("Module_3.R")
## Output : Feature Parameters (feature_scores)
```

Module 3 Output (Adaptive LASSO Results):
```{r}
# Preview the feature scores computed by Module 3.
head(as.data.frame(feature_scores))
```


Module 4 (TSEA - Specify Disease):
```{r}
# Diseases to look for in TSEA, given as keywords.
# NOTE(review): how the keywords are matched is decided by the TSEA module,
# which is not visible here.
disease <- c("Colorectal","Crohn","Colon")
```

Module 4 (TSEA Type of feature):
```{r}
# Kind of feature names to convert for TSEA. The two conversion chunks below
# test this value against "OTU" and "Microbes"; only the matching one runs.
TSEA_feature <- "Microbes"
```


Module 4 (TSEA - OTU):
If Features are OTU and need to be changed into appropriate Microbes for TSEA
```{r}
if (TSEA_feature == "OTU") {
  # List of microbes from selected features (OTU): map OTU ids to taxa.
  # NOTE(review): the taxonomy path is machine-specific, and the file has a
  # .csv extension but is read with read.table()'s default whitespace
  # separator -- confirm both against the actual file.
  OTU_file <- read.table("/Users/Vartika_Bisht/Documents/GitHub/Microbiome_Data_Analysis/Data/NFnetFU_Dataset3_taxanomy.csv", header = TRUE)
  OTU_index <- which(OTU_file$OTU %in% rownames(feature_scores))
  selected_OTU <- OTU_file[OTU_index, ]
  feature_inorder <- selected_OTU$OTU
  write.csv(selected_OTU, "OTU Microbes Selected Table.csv")

  # Valid microbe names.
  # Each taxonomy string is a ";"-separated list of ranks, each followed by a
  # confidence in parentheses, e.g. "unclassified(100)". The name used for
  # TSEA is the deepest rank that is not "unclassified".
  deepest_classified_rank <- function(ranks) {
    if (length(ranks) == 0 || is.na(ranks[1]) ||
        ranks[1] == "unclassified(100)") {
      return("unclassified")
    }
    # Strip the trailing "(<confidence>)" whatever its width; cutting a fixed
    # five characters would damage names whose confidence is not "(100)".
    rank_names <- sub("\\([0-9.]+\\)$", "", ranks)
    classified <- rank_names[nzchar(rank_names) & rank_names != "unclassified"]
    # Always return exactly one name per OTU so the result stays aligned with
    # feature_inorder, even when every rank is unclassified.
    if (length(classified) == 0) {
      return("unclassified")
    }
    classified[length(classified)]
  }

  taxa <- strsplit(as.character(selected_OTU$Taxonomy), ";")
  OTU_network <- vapply(
    taxa,
    deepest_classified_rank,
    character(1),
    USE.NAMES = FALSE
  )

  # One row per selected OTU: row name = OTU id, value = name used for TSEA.
  Name_Change <- as.data.frame(OTU_network)
  rownames(Name_Change) <- feature_inorder
  write.csv(Name_Change, "Features to Microbes for TSEA.csv")
}

```

Module 4 (TSEA - Microbes of Different Taxa Level):
If Features are Microbes of Different Taxa Level and need to be changed into appropriate Microbes for TSEA
```{r}
if (TSEA_feature == "Microbes") {
  # List of microbes from selected features (Microbes).
  # The features are the column names of the rule based matrix; the first
  # three characters are dropped (presumably a rank prefix such as "g__" --
  # confirm against the data).
  feature_inorder <- colnames(rules_int)
  Microbes_name <- substring(feature_inorder, 4)
  #Microbes_name <- colnames(rules_int)

  # First tokens that do not name a taxon on their own.
  placeholder_heads <- c("uncultured", "X")

  # Pick the token used as the TSEA name from one split feature name.
  pick_tsea_name <- function(tokens) {
    if (length(tokens) > 1 && tokens[1] %in% placeholder_heads) {
      # After a placeholder, a bare "1"/"2" is not a name either.
      if (tokens[2] %in% c("1", "2")) "uncultured" else tokens[2]
    } else {
      tokens[1]
    }
  }

  # Split every name on runs of the separator characters and keep one token
  # per feature. vapply() returns a character vector of the same length as
  # the input (including length zero) without growing it inside a loop.
  OTU_network <- vapply(
    strsplit(Microbes_name, split = "[\\|.,+]+"),
    pick_tsea_name,
    character(1),
    USE.NAMES = FALSE
  )

  # One row per feature: row name = original feature, value = TSEA name.
  Name_Change <- as.data.frame(OTU_network)
  rownames(Name_Change) <- feature_inorder
  write.csv(Name_Change, "Features to Microbes for TSEA.csv")
}
```

Module 4 (TSEA - The names used for TSEA with the feature associated):
```{r}
# Preview the feature -> TSEA name table (row names are the original features).
head(as.data.frame(Name_Change))
```

Module 4 (TSEA - Unique microbe names submitted to TSEA):
```{r}
# Distinct TSEA names as plain character strings, then display them.
Microbes <- as.character(unique(Name_Change$OTU_network))
Microbes
```

Module 4 (TSEA - Build the TSEA network from the list of microbes):
```{r}
## Input : List of Microbes
# The module is sourced, not called with arguments; presumably it reads the
# microbe list and disease keywords from the workspace -- verify in the script.
source("Module_4(TSEA Network).R")
## Output : Network and Network Legends with Node size (Legends)
```

Module 4 (TSEA Network):
```{r}
# Draw the TSEA network g in a circular layout; vertex sizes come from
# vertex_wt and edge widths from the edges' weight attribute.
plot(g, layout=layout_in_circle, vertex.size=vertex_wt,edge.width = E(g)$weight)
```

Module 4 (TSEA Network Legends):
```{r}
# Show the network legend table (Network_Info) as a data frame.
as.data.frame(Network_Info)
```

Module 4 (Infusing Data Driven Information):
TSEA Network and Adaptive LASSO Results
```{r}
## Input : TSEA Network and Adaptive LASSO Results
# Features whose TSEA name equals each network node, in the node order of
# Network_Info (one character vector per node).
network_microbes <- as.character(Network_Info[, "Microbe Names"])
Cluster_OTU_name <- lapply(network_microbes, function(microbe) {
  as.character(feature_inorder[which(OTU_network %in% microbe)])
})

# Node score = mean absolute feature score over the features behind a node.
# A node with no matching feature gets NA so that the score vector always has
# one entry per node and stays aligned with Network_Info in cbind().
# NOTE(review): assumes feature_scores holds one score per feature row.
Cluster_Parameters <- vapply(Cluster_OTU_name, function(features) {
  if (length(features) == 0) {
    return(NA_real_)
  }
  scores <- unlist(
    lapply(features, function(feature) abs(feature_scores[feature, ])),
    use.names = FALSE
  )
  sum(scores) / length(scores)
}, numeric(1))

Data_Bio_Driven <- cbind(Network_Info, Cluster_Parameters)
rownames(Data_Bio_Driven) <- NULL
# The score is the column just appended; address it by position from the end
# instead of assuming Network_Info has exactly three columns.
colnames(Data_Bio_Driven)[ncol(Data_Bio_Driven)] <- "Node Score"
write.csv(Data_Bio_Driven, "Biological Network with Data Driven Results fused.csv")
## Output : Data Driven Cluster Parameters added

```

Module 4 (Infusing Data Driven Information):
TSEA Network and Adaptive LASSO Results
```{r}
# Show the network table with the "Node Score" column added.
as.data.frame(Data_Bio_Driven)
```

Module 4 (Infusing Data Driven Information):
Module 2 Clusters and TSEA Network
```{r}
## Input : Module 2 Clusters and TSEA Network
# The module is sourced, not called with arguments; presumably it reads the
# clusters and the network from the workspace -- verify in the script.
source("Module_4(Data Driven Network).R")
## Output : Network with Data Driven Clusters
```

Module 4 (Infusing Data Driven Information):
Module 2 Clusters and TSEA Network
```{r}
# Green edges: TSEA
# Red edges: data driven clusters
# Draw the fused network gh in a circular layout; vertex sizes come from
# vertex_wt_gh, edge widths from the edges' weight attribute and edge colours
# from col_edges.
plot(gh, layout=layout_in_circle, vertex.size=vertex_wt_gh,edge.width = E(gh)$weight,edge.color=col_edges)
```

Module 4 (Infusing Data Driven Information):
Module 2 Clusters and TSEA Network
```{r}
# Show the fused network table that includes the cluster information.
as.data.frame(Data_Bio_Driven_with_clusters)
```









