## Data consist of 150 samples of organic molecular mixtures analyzed by
## pyrolysis gas chromatography-mass spectrometry (py-GC-MS).
## Each data set has the scan numbers in the rows and the mass to charge ratio (m/z)
## values in the columns.
## The data values are the raw intensities as measured by py-GC-MS.
## Each sample has either an abiotic (A) or biotic (B) origin.
## Purpose is to determine classification rules to predict whether the samples are
## biotic or abiotic.
## Purpose is also to determine the features, m/z and scan numbers, that discriminate
## between the biotic and the abiotic groups.
## The data are preprocessed before we apply random forest classification model.
## In this file we create an igraph regarding the importance variables
## from the random forest model and the Monte Carlo simulations.
library(dplyr) # for dataframe computation
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(MALDIquant) # for chemometrics processing
##
## This is MALDIquant version 1.22.2
## Quantitative Analysis of Mass Spectrometry Data
## See '?MALDIquant' for more information about this package.
library(caret) # for machine learning
## Loading required package: ggplot2
## Loading required package: lattice
library(mlr3) # for machine learning
library("mlr3verse") # for machine learning
library("mlr3learners")# for machine learning
library("mlr3tuning") # for machine learning
## Loading required package: paradox
library("data.table") # for rbindlist
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
library("ggplot2") # for plots
library(RColorBrewer) # for color palettes
library(igraph) # for igraphs
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
#Data preparation
setwd("C:/Desktop/Templeton_grant_research/Second_paper_material/Data2022")
species=c(rep("A",5),"B", "A","A","B","B","A","C", rep("B",5),rep("C",4),rep("B",4),"A",
rep("B",11),"A","A","B","A","A","B","B","A",rep("B",3),"A","B",rep("A",6),
rep("B",3),"A","A","B","B",rep("A",4),"B",rep("A",5),"B",rep("A",10),
rep("B",4),rep("A",5),rep("B",3),rep("A",3),rep("B",4),rep("A",8),"B",
"A","A","B","C","C","A",rep("B",3),"A","B", "B","A","B",rep("A",3),"B","A",
rep("A",7),"B","A","B")
species2=as.numeric(as.factor(species))
ind=which(species2=="3")
species_n=species2[-ind]
#Reading in the data is based on the code found in:
## "How to import multiple .csv files simultaneously in R and create a data frame"
## by Dahna, A., datascience+, March 09, 2019
#https://datascienceplus.com/how-to-import-multiple-csv-files-simultaneously-in-r-and-create-a-data-frame/
species_names <- list.files()
species_names=species_names[-ind]
z=lapply(species_names, read.delim) #read in all of 134 datasets
setwd("C:/Desktop/Templeton_grant_research/Second_paper_material/Data2023")
species_names2 <- list.files()
z2=lapply(species_names2, read.delim) #read in all of 16 datasets
NN=700 #number of m/z values
mass=seq(50,NN,1) #m/z
MM=3240 #number of scans
ndim=length(species_names)
ndim2=length(species_names2)
#Scan number:m/z:intensity values for 134 samples
M=list()
for(i in 1:ndim){
colnames(z[[i]])="mass"
#remove commas
z[[i]]=data.frame(do.call("rbind", strsplit(as.character(z[[i]]$mass), ",",
fixed = TRUE)))
z[[i]]=data.frame(lapply(z[[i]],as.numeric))
colnames(z[[i]])=c("scan",as.character(seq(50,NN,1)))
z[[i]]=z[[i]] %>% slice(1:MM) #selects the first MM rows
M[[i]]=z[[i]]
}
#Scan number:m/z:intensity values for 16 samples
M2=list()
for(i in 1:ndim2){
colnames(z2[[i]])="mass"
#remove commas
z2[[i]]=data.frame(do.call("rbind", strsplit(as.character(z2[[i]]$mass), ",",
fixed = TRUE)))
z2[[i]]=data.frame(lapply(z2[[i]],as.numeric))
colnames(z2[[i]])=c("scan",as.character(seq(50,NN,1)))
z2[[i]]=z2[[i]] %>% slice(1:MM) #selects the first MM rows
M2[[i]]=z2[[i]]
}
M_new=c(M,M2)
N=length(M_new)
species_n2=rep(2,16)
species_names_new=c(species_names,species_names2)
y=c(species_n,species_n2)
y=factor(y,labels=c("A","B"))
#########################################################################################
#Preprocessing
#Detect the significant peaks as local max above four times the signal to noise ratio
MZ=151 #number of m/z values to use
#Create Chromatograms for each sample and each m/z value inside each sample
sample_list=list()
sample_location_list=list()
suppressWarnings({
for (i in 1:N){
S=list()
for(j in 1:MZ){
S[[j]] = createMassSpectrum(mass=seq(1,MM,1), intensity=unlist(M_new[[i]][,j+1]),
metaData=list(name="Chrom"))
}
chrom = transformIntensity(S,method="sqrt") #stabilize the variance
chrom = smoothIntensity(chrom, method="MovingAverage",halfWindowSize=5)#smooth the data
chrom = removeBaseline(chrom, method="SNIP") #remove the baseline
# Put the processed chromatograms back into a dataframe
processed_chrom_list=list()
for (k in 1:MZ){
processed_chrom_list[[k]] = as.numeric(intensity(chrom[[k]]))
}
processed_mass_dataframe = as.data.frame(do.call(rbind, processed_chrom_list))
Ma=max(processed_mass_dataframe)
Mi=min(processed_mass_dataframe)
#Normalize across sample
processed_mass_dataframe = t((processed_mass_dataframe - Mi)/(Ma - Mi))
processed_mass_dataframe = as.data.frame(processed_mass_dataframe)
S2=list()
for(t in 1:MZ){
S2[[t]] = createMassSpectrum(mass=seq(1,MM,1),
intensity=processed_mass_dataframe[, t],
metaData=list(name="Chrom_normalized"))
}
peaks = detectPeaks(S2, method="MAD", halfWindowSize=20, SNR=4)
peak_list=list()
for (tt in 1:MZ){
v=numeric(MM)
scan_number=mass(peaks[[tt]])
v[scan_number] = intensity(peaks[[tt]])
peak_list[[tt]] = v
}
processed_peaks = t(as.data.frame(do.call(rbind, peak_list)))
row.names(processed_peaks)=c(paste0("R", 1:MM))
colnames(processed_peaks)=c(paste0("M", 50:(MZ+50-1)))
processed_peaks2=as.data.frame(processed_peaks) %>%
mutate(bin = cut(seq(1,MM,1),breaks=180,dig.lab = 6)) %>% # bin the peaks by scan #
group_by(bin) %>%
summarise_all(max)
sample_list[[i]] = processed_peaks2
sample_location_list[[i]] = processed_peaks
}
})
#Scan # and mass spectrum for each sample
mass_scan_list_loc=list()
for(i in 1:N){
sm_loc=as.numeric(unlist(sample_location_list[[i]]))
mass_scan_list_loc[[i]]=sm_loc
}
#Sample vs mass/scan numbers
#Put the samples into a dataframe
data_mass_scan_loc = do.call(rbind, mass_scan_list_loc)
bin2=as.character(seq(1,3240,1))
MS=as.character(seq(50,(MZ+50-1),1))
colnames(data_mass_scan_loc) = paste(outer(bin2, MS, paste, sep = ';'))
data_mass_scan_new=as.data.frame(ifelse(data_mass_scan_loc > 0,1,0))
MZ_name=c(paste0(";",50:(MZ+50-1)))
MZ_name2=c(paste0(50:(MZ+50-1)))
MZ_name3=c(paste0(".",50:(MZ+50-1)))
bin3 = unique(cut(seq(1,MM,1),breaks=180,dig.lab = 6))
#Finding the endpoints in bin4 is based on code found in
#"Obtain endpoints from interval that is factor variable"
#https://stackoverflow.com/questions/40665240/obtain-endpoints-from-interval-that-is-factor-variable
#Stack Overflow:
bin4=unique(as.numeric(unlist( strsplit( gsub( "[][(]" , "", levels(bin3)) , ","))))
#Determine the mean scan number for each bin and each
#m/z value across the training samples
mean_scan_name = list()
for (i in 1:MZ){
data_mass_scan_new2 = data_mass_scan_new %>% select(ends_with(MZ_name[i]))
scan_name=sub(basename(colnames(data_mass_scan_new2)),pattern = MZ_name[i],
replacement = "" , fixed = TRUE)
colnames(data_mass_scan_new2) = scan_name #name the columns with the scan numbers
count_elements=function(x){
y=sum(x)
}
#Count the number of elements for each scan number across the training samples
n_elements=as.numeric(apply(as.matrix(data_mass_scan_new2),2,count_elements))
#Repeat the nonzero scan numbers with its frequency
vec=as.numeric(rep(colnames(data_mass_scan_new2), times=n_elements))
scan_dataframe=list() #scan numbers for each bin for the ith m/z value selected
indd1=1:floor(bin4[2])
ind_L1=which(vec %in% indd1)
Lt1=vec[ind_L1]
#Mean number of scan number in the first bin
scan_mean1=if(length(Lt1)>0){round(mean(Lt1))
}else {-1}
DD1=data.frame(matrix(ncol = 1, nrow = 1))
colnames(DD1)= paste(outer(as.character(scan_mean1),MZ_name2[i], paste, sep = ';'))
scan_dataframe[[1]]=DD1
for (j in 2:(length(bin4)-1)){
indd=(floor(bin4[j])+1):floor(bin4[j+1])
ind_L=which(vec %in% indd)
Lt=vec[ind_L]
#Mean number of scan number in each bin
scan_mean=if(length(Lt)>0){round(mean(Lt))
}else {-j} #to give empty columns a unique name
DD=data.frame(matrix(ncol = 1, nrow = 1))
colnames(DD)= paste(outer(as.character(scan_mean),MZ_name2[i],
paste, sep = ';'))
scan_dataframe[[j]]=DD
}
#The mean scan number for each bin of scans for the ith m/z value and each sample
mean_scan_name[[i]]=do.call(cbind, scan_dataframe)
}
# To get the column names of the interleaved m/z values and scan numbers as columns
data_mass_scan_interl=do.call(cbind, mean_scan_name)
names_new=colnames(data_mass_scan_interl)
#Scan # and mass spectrum for each sample
mass_scan_list=list()
for(i in 1:N){
sm=as.numeric(unlist(sample_list[[i]]))[181:(180*(MZ+1))]
mass_scan_list[[i]]=sm
}
#Sample vs mass/scan numbers
data_preprocessed = do.call(rbind, mass_scan_list) #put the samples into a dataframe
colnames(data_preprocessed ) = names_new
data_preprocessed = as.data.frame(data_preprocessed )
count_nonzero=function(x){(length(which(x > 0)))/(dim(data_preprocessed)[2])}
#Ratio of nonzero feature values for each observation
Perc_Nnonzero=apply(data_preprocessed ,1,count_nonzero)
data_preprocessed = data_preprocessed %>% mutate(Perc_Nnonzero)
colnames(data_preprocessed) = make.names(colnames(data_preprocessed))
#Removing variables with near zero variance, nearZeroVar, is based on the book
#"Applied Predictive Modeling" by Kuhn, M. and Johnson, K.,
#Springer, New York, 2013
#Detect features with near zero variance
near_zero_variance = nearZeroVar(data_preprocessed)
#Remove features with near zero variance
data_preprocessed = data_preprocessed[, -near_zero_variance]
dim(data_preprocessed) #new dimension
## [1] 150 9165
#Contains all samples with the selected features and corresponding y-values (A or B)
training_transformed=data.frame(data_preprocessed,y=y)
#########################################################################################
#Machine learning
#The machine learning R-code is based on the mlr3 library and the book
## https://mlr3book.mlr-org.com/, "Flexible and Robust Machine Learning Using mlr3 in R"
#by Lang, M. et al
#Create a task
task=as_task_classif(training_transformed, target = "y", positive="B")
#Using stratified random sampling for each resampling
task$col_roles$stratum = "y"
#Determine importance variables
#Use the results from the random forest model in previous file with
#mtry=1688 and num.trees=2658
learner2 = lrn("classif.ranger", id = "rf",predict_type="prob",importance = "impurity")
learner2$param_set$values$mtry = 1688
learner2$param_set$values$num.trees = 2658
set.seed(99)
learner2$train(task)
## Growing trees.. Progress: 74%. Estimated remaining time: 11 seconds.
#Importance variables in ranked order
filter = flt("importance", learner = learner2)
filter$calculate(task)
## Growing trees.. Progress: 75%. Estimated remaining time: 10 seconds.
dff=as.data.table(filter)
#Monte Carlo (MC) simulations using random forest from previous file
df.list=readRDS("C:/Desktop/Templeton_grant_research/Second_paper_material/Materials_for_submission/New_excel_4.3.3/df.list_Monte_Carlo.RDS")
df.test = readRDS("C:/Desktop/Templeton_grant_research/Second_paper_material/Materials_for_submission/New_excel_4.3.3/df.test_Monte_Carlo.RDS")
#####Determine the importance variables that have the lowest average rank across the
#Monte Carlo samples and that are among the original importance variables from the
#random forest model
K=(dim(training_transformed)[2]-1)
variables=colnames(training_transformed)[1:K]
ID=seq(1,K,1)
variables2=data.frame(variables,ID)
feature_rank = list() #Arrange by features and include their ranks
for (i in 1:50){
ind2=2*i-1
fn = as.character(unlist(as.data.frame(df.list)[,ind2]))
feature_rank2 = list()
for (j in 1:K){
ID2=which(variables2[,1] %in% fn[j])
feature_rank2[[j]] = data.frame(ID2,j)
colnames(feature_rank2[[j]]) = c("Feature","Rank")
}
feature_rank3 = do.call(rbind, feature_rank2) #put each feature in ranked order
feature_rank[[i]] = (feature_rank3 %>% arrange(Feature))[,2]
}
#Rank for each feature across the MC samples
feature_rank4 = data.frame(ID,do.call(cbind, feature_rank))
#Calculate the mean rank for each feature across the MC samples
feature_rank4 =feature_rank4 %>% mutate(mean_rank = apply(feature_rank4[,2:51],1,mean))
#Order the features according to the lowest mean rank across the MC samples
feature_rank5= feature_rank4 %>% arrange(mean_rank)
DIMM=60
vari = feature_rank5[1:DIMM,1]
#Ranked importance variables average across the MC samples
average_imp=variables2[vari,]
# The first DIMM importance variables in the original sample
base=as.character(unlist(dff[1:DIMM,1]))
#Lowest ranked importance variables in the original sample that are also among the lowest
#ranked importance variables across the MC samples
Av=intersect(base,average_imp[,1])
y2=c(1,1,1,1,1,3,1,1,3,3,1,rep(2,9),1,3,3,rep(2,9),1,1,3,1,1,2,2,1,3,2,3,1,2,rep(1,6),
2,3,2,1,1,2,3,rep(1,4),2,rep(1,5),2,rep(1,10),3,2,2,2,rep(1,5),2,2,2,1,1,1,3,3,3,3,
rep(1,8),3,1,1,3,1,2,3,2,1,2,2,1,3,1,1,1,2,rep(1,8),3,1,3,rep(2,13),3,2,3)
y2=factor(y2,labels=c("A","B","C"))
#A: Abiotic
#B: Contemporary biotic
#C: Altered biotic
indexA=which(y2=="A")
indexB=which(y2=="B")
indexC=which(y2=="C")
lA=length(indexA) #number of abiotic samples
lB=length(indexB) #number of contemporary biotic samples
lC=length(indexC) #number of altered biotic samples
abiotic=species_names_new[indexA] #abiotic sample names
bioticB=species_names_new[indexB] #contemporary biotic sample names
bioticC=species_names_new[indexC] #altered biotic sample names
########################################################################################
L=50
#Create the aggregated indicator matrix
IN=matrix(rep(0,K*K),nrow=K)
#Create the ID vector
Id_vector=numeric(K)
for (i in 1:L){
Id_matrix=matrix(rep(0,K*K),nrow=K)
Id_vector2=numeric(K)
fn=df.list[[(2*i-1)]][1:60] #60 highest importance variables in order
ID2=which(variables2[,1] %in% fn )
Id_matrix[ID2,ID2] = 1
Id_vector2[ID2]=1
IN = IN+Id_matrix #number of times each pair of variables occurs in the same sample
Id_vector=Id_vector+Id_vector2 #number of times each variable occurs in the sample
}
#Adjacency matrix for elements that occur together 90% of the times
cons_A=matrix(rep(0,K*K),nrow=K)
for(i in 1: K){
for (j in i:K){
if(IN[i,j]>=45) {
cons_A[i,j]=1
cons_A[j,i]=1
}
else{
cons_A[i,j]=0
cons_A[j,i]=0
}
}
}
g=graph_from_adjacency_matrix(cons_A, mode = c("undirected"))
lg=largest_cliques(g) #only one of the largest maximal clique
#elements in the largest maximal clique
clique_num(g) #number of elements in the largest maximal clique
## [1] 11
var1=variables2[unlist(lg),1] #features in the largest maximal clique
var2=variables2[which(Id_vector>=45),1] #features that occur in 90% of the samples
var2 %in% var1
## [1] TRUE TRUE FALSE TRUE TRUE TRUE FALSE TRUE FALSE TRUE TRUE FALSE
## [13] TRUE TRUE TRUE
var1 %in% var2
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
df1=intersect(var1,Av)
df2=intersect(var2,Av)
##################################################################################
#######Calculate the proportion of samples that have the importance variables
######found above in df2
data_list=list()
for(i in 1:length(df2)){
data_list2=training_transformed %>% select(df2[i]| "y")
data_list[[i]] = data_list2[,1]
}
data_AB = do.call(cbind, data_list)
colnames(data_AB) = df2
#Put non-zero values = 1
data_AB2=ifelse(data_AB[,1:(length(df2))]>0,1,0)
data_AB3=data.frame(apply(data_AB2,2,mean))
colnames(data_AB3)=c("Prop. total")
###############################################################################
#########Create an igraph
#Determine the indexing of the features in variables2 that occur in df2
vari=numeric(length(df2))
for (i in 1:length(df2)){
vari[i]=which(variables2[,1]==df2[i])}
vari=sort(vari)
#Elements that are found at least 90% of the times. The color of the links indicates the proportion
#of times the elements are found together.
adj=IN[vari,vari]/50 #Adjacency matrix
g2=graph_from_adjacency_matrix(adj, mode="undirected", weighted=TRUE)
g2=simplify(g2,remove.loops = TRUE)
#Assignment of color is based on code found in
#"How can I assign a color range to edges in igraph plot in R based on edge attributes?"
#in stack Overflow:
#https://stackoverflow.com/questions/66974308/how-can-i-assign-a-color-range-to-edges-in-igraph-plot-in-r-based-on-edge-attrib
n_bins = 4
sn_color = brewer.pal(n = 8, name = 'OrRd')[c(2,5,7,8)]
node_size = setNames(data_AB3[,1],as.character(seq(1,length(df2),1)))
var_index=paste0("", 1:length(df2), ": ",df2)
var_index = paste0(var_index)
#Figure 5E
# Elements index in df2 that are in the largest maximal clique
clique_match_index = match(intersect(df2,df1),df2)
col_igraph = rep("black",20)
blue_col = col_igraph[clique_match_index] = "#2166AC"
set.seed(99)
#par(xpd=T,family="sans", ps=9)
plot(g2, edge.width=E(g2)$weight+ min(E(g2)$weight) + 1,
edge.color=sn_color[cut(E(g2)$weight, breaks=n_bins )],
vertex.size=50*node_size, vertex.color=sn_color[cut(diag(adj),
breaks=c(0.80, 0.85, 0.90, 0.95,1) )], main=""
)
legend(-2,1.2 , levels(cut(E(g2)$weight, breaks=n_bins )),
col=sn_color,fill=sn_color,cex=1)
legend(1.05, 1.2, var_index,text.col =col_igraph ,cex=1)

#Vertices: the features with the highest importance values and that occur in at least 90%
#of the Monte Carlo samples
#Edge Color: proportion of times two features are found together in the MC samples
#Vertex color: proportion of times a feature is found in the MC samples
#Vertex Size (Prop.): proportion of the original sample with this particular feature