Set Working Directory :
# Make the project folder the working directory so that the relative file names
# used in the rest of this document resolve against it.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
setwd("/Users/Vartika_Bisht/Individual_Project")
Load all source files :
source("Penalty_Function.R")
source("Incorporate_Groups.R")
source("Libraries_Needed.R")
source("Borrowed_Functions.R")
source("MicrobiomeAnalyst.R")
source("best_epsilon_DBSCAN.R")
Load Data Set :
# Read the first sheet of the Dataset 1 workbook.
Data_Set_1 <- read.xlsx("NFnetFU_Dataset1_wih_labels.xlsx", sheetIndex = 1)
# Columns 3 to 49 are used as the microbiome features; they are kept both as a
# data frame (df_data1) and as a numeric matrix (data1).
df_data1 <- as.data.frame(Data_Set_1[3:49])
data1 <- data.matrix(df_data1)
# Numeric labels for prediction: one numeric code per distinct value of the
# Class column (the original note places that column second in the sheet).
label_dat <- as.numeric(factor(Data_Set_1$Class))
Input Dataset :
# Preview the first rows of the feature columns selected from Data_Set_1.
head(df_data1)
Input Data for Module 1 (Features) :
# data1 is the data.matrix() form of df_data1; show its first rows as a data frame.
head(as.data.frame(data1))
Input Data for Module 1 (Output Variable: Given Variable) :
# Labels: the class labels exactly as stored in the Class column of Data_Set_1
Data_Set_1$Class
[1] "HC" "HC" "HC" "HC" "HC" "HC" "HC" "HC" "HC" "HC" "PSC-UC"
[12] "PSC-UC" "PSC-UC" "PSC-UC" "PSC-UC" "PSC-UC" "PSC-UC" "PSC-UC" "PSC-UC" "PSC-UC" "UC" "UC"
[23] "UC" "UC" "UC" "UC" "UC" "UC" "UC" "UC"
Input Data for Module 1 (Output Variable: Desired Labels) :
# Desired labels: the Class column encoded as numbers via as.numeric(factor(...))
label_dat
[1] 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3
Module 1:
## Input : Numeric Labels(label_dat) and Microbiome Abundance Data(data1)
# Runs Module 1 by sourcing its script; the inputs named above must already
# exist in the workspace.
# NOTE(review): label_dat printed after this step differs from the vector
# printed before it, so the module appears to overwrite label_dat -- confirm
# in Module_1.R.
source("Module_1.R")
|
| | 0%
|
|========= | 10%
|
|=================== | 20%
|
|============================ | 30%
|
|===================================== | 40%
|
|============================================== | 50%
|
|======================================================== | 60%
|
|================================================================= | 70%
|
|========================================================================== | 80%
|
|==================================================================================== | 90%
|
|=============================================================================================| 100%
[1] "ANFIS DONE!"
[1] "New labels have been assigned!"
[1] "Rule based matrix is saved!"
[1] "Scaled Ruled Based Matrix saved"
## Output : Rule Based Matrix (rules_int) , Scaled Rule Based Matrix (scaled_rules_int) and Labels (label_dat)
Module 1 Output (Rule Based Matrix):
# Show the first rows of the rule based matrix (rules_int) as a data frame.
head(as.data.frame(rules_int))
Module 1 Output (Labels):
# Print label_dat as a plain vector (the first column of its data-frame form).
as.data.frame(label_dat)[,1]
[1] 1 3 1 2 3 3 1 3 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3
Module 2:
## Input : Scaled Rule Based Matrix (scaled_rules_int)
# Runs Module 2 by sourcing its script; scaled_rules_int must already exist in
# the workspace.
# NOTE(review): per the messages it prints, the module clusters the features
# and combines grouped features using PCA loadings -- confirm in Module_2.R.
source("Module_2.R")
[1] "Epsilon value used : 3.5"
[1] "13 cluster(s) found!"
[1] "Clustering Done!"
[1] "Feature's cluster number saved"
[1] "Grouping Highly Colinear Features Together :-"
[1] "Clubbing features in a group together"
[1] "Features Clubbed and incorporated in a new Data Frame!"
[1] "Rule Based matrix with Colinearity Handled saved"
[1] "PCA Loadings used to combine groups saved"
## Output : Rule Based matrix with Colinearity Handled (new_data1) and PCA Loadings used to combine groups (PCA_loadings)
Module 2 Output (Clusters):
# Print the feature groups; in the output below each list element holds the
# feature names belonging to one group.
groups_we_need
[[1]]
[1] "c__Gammaproteobacteria" "f__Enterobacteriaceae" "o__Enterobacteriales"
[[2]]
[1] "f__Myxococcales.0319.6G20" "s__Myxococcales.sp"
[[3]]
[1] "c__Lentisphaeria" "o__Victivallales" "p__Lentisphaerae"
[[4]]
[1] "g__Streptococcus" "f__Brevibacteriaceae" "s__Brevibacterium.sp" "c__Bacilli"
[[5]]
[1] "c__Coriobacteriia" "o__Coriobacteriales" "f__Coriobacteriaceae"
[[6]]
[1] "g__Paraprevotella" "g__Paraprevotella.1"
[[7]]
[1] "g__Erysipelotrichaceae.cc_115" "s__Erysipelotrichaceae.cc_115.sp"
[[8]]
[1] "s__Desulfovibrionaceae.sp" "g__Desulfovibrionaceae.unidentified"
[[9]]
[1] "s__Sphingomonas.sp" "g__Actinomyces" "s__Actinomyces" "s__Roseburia.sp"
[[10]]
[1] "s__Tepidimonas.sp" "g__Tepidimonas"
[[11]]
[1] "g__Rothia" "s__mucilaginosa."
[[12]]
[1] "g__Shewanella" "f__Shewanellaceae"
[[13]]
[1] "s__Parvimonas.sp" "g__Parvimonas"
Module 2 Output (PCA Loadings):
# PCA_loadings is transposed before display; show the first rows of the result.
head(as.data.frame(t(PCA_loadings)))
Module 2 Output (New Data Frame):
# Show the first rows of new_data1 as a data frame.
head(as.data.frame(new_data1))
Module 3:
## Input : Rule Based matrix with Colinearity Handled (new_data1) and PCA Loadings used to combine groups (PCA_loadings)
# Runs Module 3 by sourcing its script; the inputs named above must already
# exist in the workspace. Later code in this document reads its result from
# the object feature_scores.
source("Module_3.R")
[1] "Feature scores computed and saved"
## Output : Feature Parameters (feature_scores)
Module 3 Output (Adaptive LASSO Results):
# Show the first rows of feature_scores as a data frame.
head(as.data.frame(feature_scores))
Module 4 (TSEA - Specify Disease):
# Diseases to look for in TSEA
# NOTE(review): presumably matched against disease names inside the Module 4
# scripts; this variable is not read anywhere else in this document -- confirm.
disease <- c("Colorectal","Crohn","Colon")
Module 4 (TSEA Type of feature):
# Kind of feature names in the data: "OTU" or "Microbes". The value decides
# which of the two name-conversion branches that follow is executed.
TSEA_feature <- "Microbes"
Module 4 (TSEA - OTU): If Features are OTU and need to be changed into appropriate Microbes for TSEA
if (TSEA_feature == "OTU") {
  # Features are OTU identifiers: translate every selected OTU into the microbe
  # name that is used for TSEA, and save both the selected taxonomy rows and
  # the feature-to-microbe mapping as CSV files.

  # OTU-to-taxonomy lookup table.
  # NOTE(review): absolute, machine-specific path; read.table() also uses its
  # default whitespace separator although the file is named ".csv" -- confirm
  # the file layout.
  OTU_file <- read.table(
    "/Users/Vartika_Bisht/Documents/GitHub/Microbiome_Data_Analysis/Data/NFnetFU_Dataset3_taxanomy.csv",
    header = TRUE
  )

  # Keep only the OTUs that received a feature score.
  OTU_index <- which(OTU_file$OTU %in% rownames(feature_scores))
  selected_OTU <- OTU_file[OTU_index, ]
  feature_inorder <- selected_OTU$OTU
  write.csv(selected_OTU, "OTU Microbes Selected Table.csv")

  # Valid microbe names: one name per selected OTU, in the same order as
  # feature_inorder. Each taxonomy string is a ";"-separated list of levels,
  # each level carrying a "(<number>)" suffix.
  taxa <- strsplit(as.character(selected_OTU$Taxonomy), ";")
  OTU_network <- vapply(taxa, function(taxon_levels) {
    # An OTU that is unclassified from the first level on stays "unclassified".
    # identical() (rather than ==) keeps an NA/empty taxonomy from aborting.
    if (identical(taxon_levels[1], "unclassified(100)")) {
      return("unclassified")
    }
    # Strip the trailing "(<number>)" whatever its width, so "(97)" is handled
    # the same way as "(100)".
    level_names <- sub("\\([0-9]+\\)$", "", taxon_levels)
    classified <- level_names[
      !is.na(level_names) & nzchar(level_names) & level_names != "unclassified"
    ]
    # Always return exactly one name per OTU so the result stays aligned with
    # feature_inorder; fall back to "unclassified" when no level is usable.
    if (length(classified) == 0L) {
      "unclassified"
    } else {
      # The deepest (last) classified level wins.
      classified[length(classified)]
    }
  }, character(1), USE.NAMES = FALSE)

  Name_Change <- as.data.frame(OTU_network)
  rownames(Name_Change) <- feature_inorder
  write.csv(Name_Change, "Features to Microbes for TSEA.csv")
}
Module 4 (TSEA - Microbes of Different Taxa Level): If Features are Microbes of Different Taxa Level and need to be changed into appropriate Microbes for TSEA
if (TSEA_feature == "Microbes") {
  # Features are microbes named at mixed taxonomic levels: reduce every feature
  # name to a single microbe name for TSEA and save the mapping as a CSV file.

  # Drop the first three characters of each feature name (for example the
  # "g__" in "g__Rothia").
  # Alternative that keeps the full name: Microbes_name <- colnames(rules_int)
  Microbes_name <- substring(colnames(rules_int), 4)

  OTU_network <- unlist(lapply(Microbes_name, function(feature_name) {
    # Split the name into tokens at runs of the separator characters.
    tokens <- strsplit(feature_name, split = "[\\|.,+]+")[[1]]

    # Names whose first token is "uncultured" or "X" carry the useful part in
    # the second token; every other name is represented by its first token.
    placeholder_first <- length(tokens) > 1 &&
      tokens[1] %in% c("uncultured", "X")
    if (!placeholder_first) {
      return(tokens[1])
    }

    # A second token of "1" or "2" is not a name either, so the feature is
    # reported as "uncultured".
    if (tokens[2] %in% c("1", "2")) "uncultured" else tokens[2]
  }))

  feature_inorder <- colnames(rules_int)
  Name_Change <- as.data.frame(OTU_network)
  rownames(Name_Change) <- feature_inorder
  write.csv(Name_Change, "Features to Microbes for TSEA.csv")
}
Module 4 (TSEA - The names used for TSEA with the feature associated):
# First rows of the mapping: row names are the original features, the
# OTU_network column is the microbe name used for TSEA.
head(as.data.frame(Name_Change))
Module 4 (TSEA - Unique microbe names used for TSEA):
# Distinct microbe names, as character strings, in order of first appearance.
Microbes <- as.character(unique(Name_Change$OTU_network))
Microbes
[1] "Gammaproteobacteria" "Enterobacteriaceae" "Enterobacteriales" "Prevotellaceae"
[5] "Paraprevotellaceae" "Myxococcales" "Lentisphaeria" "Victivallales"
[9] "Lentisphaerae" "Streptococcus" "Coriobacteriia" "Coriobacteriales"
[13] "Coriobacteriaceae" "Paraprevotella" "Blautia" "Erysipelotrichaceae"
[17] "symbiosum" "Desulfovibrionaceae" "Brevibacteriaceae" "Brevibacterium"
[21] "ramosum" "Sphingomonas" "Tepidimonas" "Microbacteriaceae"
[25] "hathewayi" "Staphylococcus" "adolescentis" "Rothia"
[29] "citroniae" "mucilaginosa" "distasonis" "Actinomyces"
[33] "stutzeri" "Alteromonadales" "Shewanella" "Shewanellaceae"
[37] "Bacilli" "Roseburia" "Parvimonas" "fragilis"
Module 4 (TSEA - Building the TSEA network from the list of microbes):
## Input : List of Microbes
# Runs the TSEA network step by sourcing its script.
# NOTE(review): the log printed below shows files being downloaded from the
# MetaboAnalyst web-server, so this step appears to need network access --
# confirm in the script.
source("Module_4(TSEA Network).R")
[1] "----Microbiome Analyst----"
[1] "Init MicrobiomeAnalyst!"
% Total % Received % Xferd Average Speed Time Time Time Current
Dload Upload Total Spent Left Speed
0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0
0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0
0 0 0 0 0 0 0 0 --:--:-- 0:00:01 --:--:-- 0
0 0 0 0 0 0 0 0 --:--:-- 0:00:02 --:--:-- 0
43 217k 43 97938 0 0 32386 0 0:00:06 0:00:03 0:00:03 32376
100 217k 100 217k 0 0 69256 0 0:00:03 0:00:03 --:--:-- 69234
[1] "Loaded files from MetaboAnalyst web-server."
[11%] Downloaded 16018 bytes...
[23%] Downloaded 32402 bytes...
[35%] Downloaded 48786 bytes...
[47%] Downloaded 65170 bytes...
[59%] Downloaded 81554 bytes...
[71%] Downloaded 97938 bytes...
[83%] Downloaded 114322 bytes...
[95%] Downloaded 130706 bytes...
[100%] Downloaded 136784 bytes...
[1] "Loaded files from MetaboAnalyst web-server."
[1] "No matches were found in the selected taxon set library!"
[1] "Mixed-level taxa set was selected!"
[1] "Mix Taxa TSEA Results Calculated"
[1] "Mix Taxa TSEA Disease Specific Results Calculated"
[1] "Calculating Adjacency Matrix for Network"
## Output : Network and Network Legends with Node size (Legends)
Module 4 (TSEA Network):
# Draw the TSEA network on a circular layout; node sizes are taken from
# vertex_wt and edge widths from the edge weights stored in g.
plot(
  g,
  layout = layout_in_circle,
  vertex.size = vertex_wt,
  edge.width = E(g)$weight
)
Module 4 (TSEA Network Legends):
# Show the network legend table (Network_Info) as a data frame.
as.data.frame(Network_Info)
Module 4 (Infusing Data Driven Information): TSEA Network and Adaptive LASSO Results
## Input : TSEA Network and Adaptive LASSO Results
# For every microbe (node) of the TSEA network, collect the original features
# that were mapped to that microbe name and derive one node score: the mean of
# the absolute feature scores of those features.
network_microbes <- as.character(Network_Info[, "Microbe Names"])

# Features behind each network microbe, in network order (one list element
# per node).
Cluster_OTU_name <- lapply(network_microbes, function(microbe) {
  as.character(feature_inorder[which(OTU_network %in% microbe)])
})

# One score per node. vapply() guarantees exactly one numeric value for every
# node, so the scores stay aligned with the rows of Network_Info.
# NOTE(review): assumes feature_scores holds a single score per feature, with
# the feature names as row names -- confirm against Module_3.R.
Cluster_Parameters <- vapply(Cluster_OTU_name, function(OTU) {
  # A node without any matching feature gets NA instead of silently
  # shortening the score vector.
  if (length(OTU) == 0L) {
    return(NA_real_)
  }
  OTUs_val <- abs(unlist(
    lapply(OTU, function(j) feature_scores[j, ]),
    use.names = FALSE
  ))
  sum(OTUs_val) / length(OTUs_val)
}, numeric(1))

Data_Bio_Driven <- cbind(Network_Info, Cluster_Parameters)
rownames(Data_Bio_Driven) <- NULL
# The score is the column just appended; address it as the last column rather
# than by a fixed index so the rename does not depend on the width of
# Network_Info.
colnames(Data_Bio_Driven)[ncol(Data_Bio_Driven)] <- "Node Score"
write.csv(Data_Bio_Driven, "Biological Network with Data Driven Results fused.csv")
## Output : Data Driven Cluster Parameters added
Module 4 (Infusing Data Driven Information): TSEA Network and Adaptive LASSO Results
# Show the network table together with the added "Node Score" column.
as.data.frame(Data_Bio_Driven)
Module 4 (Infusing Data Driven Information): Module 2 Clusters and TSEA Network
## Input : Module 2 Clusters and TSEA Network
# Runs the data driven network step by sourcing its script. The plot and table
# that follow read gh, vertex_wt_gh, col_edges and
# Data_Bio_Driven_with_clusters, none of which is assigned in this document.
# NOTE(review): presumably created by this script -- confirm.
source("Module_4(Data Driven Network).R")
[1] "Calculating Edges and Nodes to be added for the Data Driven Network"
[1] "Final Fused Network Saved!"
[1] "Final Fused Network Cluster Information Saved!"
## Output : Network with Data Driven Clusters
Module 4 (Infusing Data Driven Information): Module 2 Clusters and TSEA Network
# Edge colour legend:
#   green edges <- TSEA
#   red edges   <- data driven clusters
# The fused network is drawn on a circular layout; node sizes come from
# vertex_wt_gh, edge widths from the edge weights stored in gh and edge
# colours from col_edges.
plot(
  gh,
  layout = layout_in_circle,
  vertex.size = vertex_wt_gh,
  edge.width = E(gh)$weight,
  edge.color = col_edges
)
Module 4 (Infusing Data Driven Information): Module 2 Clusters and TSEA Network
# Show the cluster information of the fused network as a data frame.
as.data.frame(Data_Bio_Driven_with_clusters)