Creating graphs for pyrolysis gas chromatography-mass spectrometry data using logistic regression with elastic net penalty

## Data consist of 150 samples of organic molecular mixtures analyzed by 
## pyrolysis gas chromatography-mass spectrometry (py-GC-MS).
## Each data set has the scan numbers in the rows and the mass to charge ratio (m/z) 
## values in the columns.
## The data values are the raw intensities as measured by py-GC-MS.
## Each sample has either an abiotic (A) or biotic (B) origin. 
## Purpose is to determine classification rules to predict whether the samples are
## biotic or abiotic.
## Purpose is also to determine the features, m/z and scan numbers, that discriminate
## between the biotic and the abiotic groups.
## In this file we create different graphs regarding the significant variables from
## logistic regression with elastic net penalty.



library(dplyr)         # for dataframe computation

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(MALDIquant)    # for chemometrics processing

## 
## This is MALDIquant version 1.22.2
## Quantitative Analysis of Mass Spectrometry Data
##  See '?MALDIquant' for more information about this package.

library(caret)         # for machine learning

## Loading required package: ggplot2

## Loading required package: lattice

library(mlr3)          # for machine learning
library("mlr3verse")   # for machine learning
library("mlr3learners")# for machine learning
library("mlr3tuning")  # for machine learning

## Loading required package: paradox

library("data.table")  # for rbindlist

## 
## Attaching package: 'data.table'

## The following objects are masked from 'package:dplyr':
## 
##     between, first, last

library(glmnet)        # for logistic regression with elastic net penalty

## Loading required package: Matrix

## Loaded glmnet 4.1-8

library(rgl)           # for 3D graphs
library(plotly)        # for interactive graphs

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

#Data preparation
setwd("C:/Desktop/Templeton_grant_research/Second_paper_material/Data2022")
species=c(rep("A",5),"B", "A","A","B","B","A","C", rep("B",5),rep("C",4),rep("B",4),"A",
          rep("B",11),"A","A","B","A","A","B","B","A",rep("B",3),"A","B",rep("A",6),
          rep("B",3),"A","A","B","B",rep("A",4),"B",rep("A",5),"B",rep("A",10),
          rep("B",4),rep("A",5),rep("B",3),rep("A",3),rep("B",4),rep("A",8),"B",
          "A","A","B","C","C","A",rep("B",3),"A","B", "B","A","B",rep("A",3),"B","A",
          rep("A",7),"B","A","B")
species2=as.numeric(as.factor(species))
ind=which(species2=="3")
species_n=species2[-ind]


#Reading in the data is based on the code found in:
## "How to import multiple .csv files simultaneously in R and create a data frame"
## by Dahna, A., datascience+, 03/09/2019
#https://datascienceplus.com/how-to-import-multiple-csv-files-simultaneously-in-r-and-create-a-data-frame/
species_names <- list.files()
species_names=species_names[-ind]
z=lapply(species_names, read.delim)  #read in all of 134 datasets

setwd("C:/Desktop/Templeton_grant_research/Second_paper_material/Data2023")
species_names2 <- list.files()
z2=lapply(species_names2, read.delim)  #read in all of 16 datasets

NN=700 #number of m/z values
mass=seq(50,NN,1) #m/z 
MM=3240 #number of scans
ndim=length(species_names)
ndim2=length(species_names2)

#Scan number:m/z:intensity values for 134 samples
M=list()
for(i in 1:ndim){
    colnames(z[[i]])="mass"
    #remove commas
    z[[i]]=data.frame(do.call("rbind", strsplit(as.character(z[[i]]$mass), ",", 
                                                fixed = TRUE)))
    z[[i]]=data.frame(lapply(z[[i]],as.numeric))
    colnames(z[[i]])=c("scan",as.character(seq(50,NN,1)))
    z[[i]]=z[[i]] %>% slice(1:MM)       #selects the first MM rows  
    M[[i]]=z[[i]]
}

#Scan number:m/z:intensity values for 16 samples
M2=list()
for(i in 1:ndim2){
    colnames(z2[[i]])="mass"
    #remove commas
    z2[[i]]=data.frame(do.call("rbind", strsplit(as.character(z2[[i]]$mass), ",",
                                                 fixed = TRUE)))
    z2[[i]]=data.frame(lapply(z2[[i]],as.numeric))
    colnames(z2[[i]])=c("scan",as.character(seq(50,NN,1)))
    z2[[i]]=z2[[i]] %>% slice(1:MM)       #selects the first MM rows
    M2[[i]]=z2[[i]]
}

M_new=c(M,M2)
N=length(M_new)
species_n2=rep(2,16)
species_names_new=c(species_names,species_names2)

y=c(species_n,species_n2)
y=factor(y,labels=c("A","B"))

#####################################################################################
#Preprocessing
#Detect the significant peaks as local max above four times the signal to noise ratio

MZ=151  #number of m/z values to use
#Create Chromatograms for each sample and each m/z value inside each sample
sample_list=list()
sample_location_list=list()
suppressWarnings({
for (i in 1:N){
  S=list()
    for(j in 1:MZ){
        log_int=ifelse(M_new[[i]][,j+1]>0, log10(M_new[[i]][,j+1]),0)
           S[[j]] = createMassSpectrum(mass=seq(1,MM,1), log_int,
           metaData=list(name="Chrom"))  
     }
   chrom = smoothIntensity(S, method="MovingAverage",halfWindowSize=5)#smooth the data
   chrom = removeBaseline(chrom, method="SNIP") #remove the baseline

   # Put the processed chromatograms back into a dataframe
   processed_chrom_list=list()
      for (k in 1:MZ){
          processed_chrom_list[[k]] = as.numeric(intensity(chrom[[k]]))
       }

   processed_mass_dataframe = as.data.frame(do.call(rbind, processed_chrom_list))
   Ma=max(processed_mass_dataframe)
   Mi=min(processed_mass_dataframe)
   #Normalize across sample
   processed_mass_dataframe = t((processed_mass_dataframe - Mi)/(Ma - Mi))
   processed_mass_dataframe = as.data.frame(processed_mass_dataframe)
   S2=list()
      for(t in 1:MZ){
         S2[[t]] = createMassSpectrum(mass=seq(1,MM,1),
                                      intensity=processed_mass_dataframe[, t],
         metaData=list(name="Chrom_normalized"))  
      }

    peaks = detectPeaks(S2, method="MAD", halfWindowSize=20, SNR=4)
    peak_list=list()
    for (tt in 1:MZ){
         v=numeric(MM)
         scan_number=mass(peaks[[tt]])
         v[scan_number] = intensity(peaks[[tt]])
           peak_list[[tt]] = v
     }
     processed_peaks = t(as.data.frame(do.call(rbind, peak_list)))
   row.names(processed_peaks)=c(paste0("R", 1:MM))
   colnames(processed_peaks)=c(paste0("M", 50:(MZ+50-1)))
     processed_peaks2=as.data.frame(processed_peaks) %>% 
   mutate(bin = cut(seq(1,MM,1),breaks=180,dig.lab = 6)) %>%  # bin the peaks by scan #
   group_by(bin) %>%
   summarise_all(max)
     sample_list[[i]] = processed_peaks2
   sample_location_list[[i]] = processed_peaks
}
})


#Scan # and mass spectrum for each sample
mass_scan_list_loc=list()
for(i in 1:N){
    sm_loc=as.numeric(unlist(sample_location_list[[i]])) 
    mass_scan_list_loc[[i]]=sm_loc
}

#Sample vs mass/scan numbers
#Put the samples into a dataframe
data_mass_scan_loc = do.call(rbind, mass_scan_list_loc) 

bin2=as.character(seq(1,3240,1))
MS=as.character(seq(50,(MZ+50-1),1))

colnames(data_mass_scan_loc) = paste(outer(bin2, MS, paste, sep = ';'))

data_mass_scan_new=as.data.frame(ifelse(data_mass_scan_loc > 0,1,0))
MZ_name=c(paste0(";",50:(MZ+50-1)))
MZ_name2=c(paste0(50:(MZ+50-1)))
MZ_name3=c(paste0(".",50:(MZ+50-1)))
bin3 = unique(cut(seq(1,MM,1),breaks=180,dig.lab = 6))

#Finding the endpoints in bin4 is based on code found in 
#"Obtain endpoints from interval that is factor variable"

#https://stackoverflow.com/questions/40665240/obtain-endpoints-from-interval-that-is-factor-variable
#Stack Overflow:
bin4=unique(as.numeric(unlist( strsplit( gsub( "[][(]" , "", levels(bin3)) , ","))))

#Determine the mean scan number for each bin and each 
#m/z value across the training samples
mean_scan_name = list()
for (i in 1:MZ){
      data_mass_scan_new2 = data_mass_scan_new %>% select(ends_with(MZ_name[i])) 
      scan_name=sub(basename(colnames(data_mass_scan_new2)),pattern = MZ_name[i], 
                    replacement = "" , fixed = TRUE)
      colnames(data_mass_scan_new2) = scan_name #name the columns with the scan numbers
      count_elements=function(x){
        y=sum(x)
    }
   #Count the number of elements for each scan number across the training samples
   n_elements=as.numeric(apply(as.matrix(data_mass_scan_new2),2,count_elements)) 
   #Repeat the nonzero scan numbers with its frequency
   vec=as.numeric(rep(colnames(data_mass_scan_new2), times=n_elements)) 
   scan_dataframe=list() #scan numbers for each bin for the ith m/z value selected 
   indd1=1:floor(bin4[2])
   ind_L1=which(vec %in% indd1)
   Lt1=vec[ind_L1]
   #Mean number of scan number in the first bin
   scan_mean1=if(length(Lt1)>0){round(mean(Lt1)) 
             }else {-1}
   DD1=data.frame(matrix(ncol = 1, nrow =  1)) 
   colnames(DD1)= paste(outer(as.character(scan_mean1),MZ_name2[i], paste, sep = ';'))
   scan_dataframe[[1]]=DD1
     for (j in 2:(length(bin4)-1)){
             indd=(floor(bin4[j])+1):floor(bin4[j+1])
        ind_L=which(vec %in% indd)
        Lt=vec[ind_L]
              #Mean number of scan number in each bin
        scan_mean=if(length(Lt)>0){round(mean(Lt)) 
                           }else {-j} #to give empty columns a unique name
        DD=data.frame(matrix(ncol = 1, nrow =  1)) 
              colnames(DD)= paste(outer(as.character(scan_mean),MZ_name2[i], 
                                        paste, sep = ';'))
              scan_dataframe[[j]]=DD
        }
   #The mean scan number for each bin of scans for the ith m/z value and each sample    
   mean_scan_name[[i]]=do.call(cbind, scan_dataframe) 
}

# To get the column names of the interleaved m/z values and scan numbers as columns
data_mass_scan_interl=do.call(cbind, mean_scan_name) 
names_new=colnames(data_mass_scan_interl)

#Scan# and mass spectrum for each sample
mass_scan_list=list()
for(i in 1:N){
   sm=as.numeric(unlist(sample_list[[i]]))[181:(180*(MZ+1))]     
   mass_scan_list[[i]]=sm
}

#Sample vs mass/scan numbers
data_preprocessed = do.call(rbind, mass_scan_list) #put the samples into a dataframe
colnames(data_preprocessed ) = names_new
data_preprocessed  = as.data.frame(data_preprocessed )

count_nonzero=function(x){(length(which(x > 0)))/(dim(data_preprocessed)[2])}
#Ratio of nonzero feature values for each observation
Perc_Nnonzero=apply(data_preprocessed ,1,count_nonzero) 

data_preprocessed  = data_preprocessed  %>% mutate(Perc_Nnonzero)
colnames(data_preprocessed) = make.names(colnames(data_preprocessed))

#Removing variables with near zero variance, nearZeroVar, is based on the book
#"Applied Predictive Modeling" by Kuhn, M. and Johnson, K.,
#Springer, New York, 2013 

#Detect features with near zero variance
near_zero_variance = nearZeroVar(data_preprocessed) 
#Remove features with near zero variance
data_preprocessed  = data_preprocessed[, -near_zero_variance] 
dim(data_preprocessed) #new dimension

## [1]  150 8204

#Contains all samples with the selected features and corresponding y-values (A or B)
training_transformed=data.frame(data_preprocessed,y=y)


#########################################################################################
#Machine learning
#The machine learning R-code is based on the mlr3 library and the book
## https://mlr3book.mlr-org.com/, "Flexible and Robust Machine Learning Using mlr3 in R"
#by Lang, M. et al

#Create a task 
task=as_task_classif(training_transformed, target = "y", positive="B")

#Using stratified random sampling for each resampling
task$col_roles$stratum = "y"


  
#########################################################################################
#Tune alpha in the elastic net model on the 150 training data

set.seed(99)
resampling_cv10 = rsmp("cv", folds = 10)  #For 10-fold cross validation
resampling_cv10$instantiate(task)

#Create the learner
learner = lrn("classif.cv_glmnet", id = "glmnet", predict_type="prob")

graph = po("removeconstants")  %>>% po("scale")  %>>% learner
plot(graph)

graph_learner = as_learner(graph)
#graph_learner$param_set$ids()

#glmnet
graph_learner$param_set$values$glmnet.alpha = to_tune(p_dbl(0,1))

graph_learner$id = "graph_learner"

RS = tnr("random_search")

future::plan("multisession")

#using stratified random sampling inside each fold
instance = tune(
  tuner = RS,
  task =  task,
  learner = graph_learner,
  resampling = resampling_cv10,
  measure = msr("classif.ce"),
  term_evals = 20
)

instance$result_y

## classif.ce 
##  0.1133929

instance$result

######################################################################################
#Tune the overall penalty parameter in the elastic net model
set.seed(99)
dim_en = dim(training_transformed)[2]-1
cv_lambda=cv.glmnet(as.matrix(training_transformed[,1:dim_en]),y,
                    alpha=instance$result$glmnet.alpha,family="binomial")
plot(cv_lambda)

best_lambda=cv_lambda$lambda.min

elastic_mod=glmnet(as.matrix(training_transformed[,1:dim_en]),y,
                   alpha=instance$result$glmnet.alpha,family="binomial")
elastic_coef = predict(elastic_mod, type = "coefficients",
                       alpha=instance$result$glmnet.alpha, 
                       s = best_lambda,family="binomial")

ind_p = which(c(elastic_coef[,1] > 0))
ind_n = which(c(elastic_coef[,1] < 0))

var_p=colnames(training_transformed[,ind_p - 1])
var_n = colnames(training_transformed[,ind_n - 1])
Av=c(var_p,var_n) #non-zero coefficients

AvE = data.frame(Av)

K=(dim(training_transformed)[2]-1)

y2=c(1,1,1,1,1,3,1,1,3,3,1,rep(2,9),1,3,3,rep(2,9),1,1,3,1,1,2,2,1,3,2,3,1,2,rep(1,6),
     2,3,2,1,1,2,3,rep(1,4),2,rep(1,5),2,rep(1,10),3,2,2,2,rep(1,5),2,2,2,1,1,1,3,3,3,3,
     rep(1,8),3,1,1,3,1,2,3,2,1,2,2,1,3,1,1,1,2,rep(1,8),3,1,3,rep(2,13),3,2,3)
y2=factor(y2,labels=c("A","B","C"))
#A: Abiotic
#B: Contemporary biotic
#C: Altered biotic

indexA=which(y2=="A")
indexB=which(y2=="B")
indexC=which(y2=="C")
lA=length(indexA) #number of abiotic samples
lB=length(indexB) #number of contemporary biotic samples
lC=length(indexC) #number of altered biotic samples
abiotic=species_names_new[indexA] #abiotic sample names
bioticB=species_names_new[indexB] #contemporary biotic sample names
bioticC=species_names_new[indexC] #altered biotic sample names

####################################################################################
#######Calculate the proportion of samples that have the significant variables sorted
#######by abiotic, biotic contemporary, and altered biotic 
calc_proportion=function(Av) {
#Intensity values for the variables in the vector Av for each species
dA=list()
dB=list()
dC=list()
for(i in 1:length(Av)){
    dA2=training_transformed %>% select(Av[i]| "y")   %>% filter(y2=="A")
    dA[[i]] = dA2[,1]
    dB2=training_transformed %>% select(Av[i]| "y")   %>% filter(y2=="B")
    dB[[i]] = dB2[,1]
    dC2=training_transformed %>% select(Av[i]| "y")   %>% filter(y2=="C")
    dC[[i]] = dC2[,1]
}

data_A = do.call(cbind, dA)
colnames(data_A) = Av
data_B = do.call(cbind, dB)
colnames(data_B) = Av
data_C = do.call(cbind, dC)
colnames(data_C) = Av

#Put non-zero values = 1 
data_AA=ifelse(data_A[,1:(length(Av))]>0,1,0)
data_BB=ifelse(data_B[,1:(length(Av))]>0,1,0)
data_CC=ifelse(data_C[,1:(length(Av))]>0,1,0)

data_ABC = rbind(data_AA,data_BB,data_CC)

data_stat=data.frame(apply(data_AA,2,mean),apply(data_BB,2,mean),apply(data_CC,2,mean),
                     apply(data_ABC,2,mean))
colnames(data_stat)=c("Pr A", "Pr B(contemporary) ","Pr B(altered)", "Prop.total")
return(data_stat)
}

data_stat=calc_proportion(Av)

#########################################################################################
######Find the distribution of proportion of abiotic samples, biotic samples,
######contemporary biotic, and altered biotic samples
allA=training_transformed %>% filter(y=="A")
allB=training_transformed %>% filter(y=="B")
allB_Cont=training_transformed[,1:(K-1)]%>% slice(indexB) #omitting Perc_Nnonzero feature 
allB_Alt=training_transformed[,1:(K-1)]%>% slice(indexC)

allA_bin=ifelse(allA[,1:(K-1)]>0,1,0)
allB_bin=ifelse(allB[,1:(K-1)]>0,1,0)
allB_Cont_bin=ifelse(allB_Cont>0,1,0)
allB_Alt_bin=ifelse(allB_Alt>0,1,0)

# Proportion of abiotic samples that contain each feature 
prop_A= apply(allA_bin,2,mean) 
# Proportion of biotic samples that contain each feature 
prop_B= apply(allB_bin,2,mean) 
# Proportion of contemporary biotic samples that contain each feature 
prop_B_Cont= apply(allB_Cont_bin,2,mean) 
# Proportion of altered biotic samples that contain each feature 
prop_B_Alt = apply(allB_Alt_bin,2,mean)  

#Median number of features for abiotic, biotic,
#contemporary biotic, and altered biotic samples, respectively
median(prop_A)

## [1] 0.1466667

median(prop_B)

## [1] 0.1866667

median(prop_B_Cont)

## [1] 0.1730769

median(prop_B_Alt)

## [1] 0.173913

##########################################################################################
#Proportion of samples that contain the significant variables 

D_A = data_stat[,1]
D_B_Cont = data_stat[,2]
D_B_Alt = data_stat[,3]
D_B = (data_stat[,2] * lB+ data_stat[,3] * lC)/(lB+lC)

DIM=length(Av)
D_A_array=array(c(D_A), dim=c(DIM,1))
D_B_array=array(c(D_B), dim=c(DIM,1))
D_B_Cont_array=array(c(D_B_Cont), dim=c(DIM,1))
D_B_Alt_array=array(c(D_B_Alt), dim=c(DIM,1))

#These significant variables are on average found in the following proportion
#of samples:
mean(D_A)

## [1] 0.1494026

mean(D_B)

## [1] 0.2177061

mean(D_B_Cont)

## [1] 0.2121933

mean(D_B_Alt)

## [1] 0.2301699

#######################################################################################
#####Distribution of the significant variables 

allA2=allA %>% select(all_of(Av))
allB2=allB  %>%  select(all_of(Av))
allB_Cont2 =  allB_Cont %>% select(all_of(Av))
allB_Alt2 = allB_Alt  %>% select(all_of(Av))

Nrow = lA+(lB+lC)*2
size_Dbar = Nrow*length(Av)
Dbar=matrix(numeric(size_Dbar),nrow=Nrow)
Dbar=as.data.frame(Dbar)

#DIM=length(Av)
for(i in 1:DIM){
    DbarA=data.frame(allA2[,i])
    colnames(DbarA)=c(Av[i])

    DbarB=data.frame(allB2[,i])
    colnames(DbarB)=c(Av[i])

    DbarBCont=data.frame(allB_Cont2[,i])
    colnames(DbarBCont)=c(Av[i])

    DbarBAlt=data.frame(allB_Alt2[,i])
    colnames(DbarBAlt)=c(Av[i])

    Dbar[,i]=rbind(DbarA,DbarB,DbarBCont, DbarBAlt)
}

Type=c(rep("A",lA),rep("B",(lB+lC)),rep("B(Cont.)",lB),rep("B(Alt.)",lC))
Dbar=data.frame(Dbar,Type)
colnames(Dbar)=c(Av,"Type")

########################################################################################
#Split the scan# and m/z values for the significant variables into two separate
#components for different number of features.

data_split=function(Av){
datadf3=Av

#Splitting a vector string based on code found in "Splitting Strings in R programming – 
#strsplit() method":
# https://www.geeksforgeeks.org/splitting-strings-in-r-programming-strsplit-method/
datadf_st3 = strsplit(datadf3, split = "[.]+")
datadf_st23=list()
for (i in 1:length(datadf_st3)){
datadf_st23[[i]] = strsplit(datadf_st3[[i]],split = '""')
}

#Get the first element of a list based on code found in
#"R list get first item of each element":
#https://stackoverflow.com/questions/44176908/r-list-get-first-item-of-each-element , 
#stack overflow
datadf_st213= unlist(sapply(datadf_st23, function(x) x[1]))

#gsub based on code found in "Remove Character From String in R":
#https://sparkbyexamples.com/r-programming/remove-character-from-string-in-r/ 
#by Nelamali,N., March 27, 2024
xx2 = as.numeric(gsub('[X]','',datadf_st213))
yy2= as.numeric(unlist(sapply(datadf_st23, function(x) x[2])))
return(list(xx2=xx2,yy2=yy2))
}
xx2=as.numeric(unlist(data_split(Av)[1]))
yy2=as.numeric(unlist(data_split(Av)[2]))

#######################################################################################

split_name = strsplit(species_names_new, split = "[.]+")
split_name2=list()
for (i in 1:length(split_name)){
split_name2[[i]] = strsplit(split_name[[i]],split = '""')
}

#Name of samples
split_name_new= unlist(sapply(split_name2, function(x) x[1]))
split_name_new =  unlist( strsplit( gsub( "[][(]" , "", split_name_new) , "3d"))

data_new=training_transformed %>%  select(all_of(Av))

y3=factor(y2,labels=c("Abiotic","Biotic (contemporary)","Biotic (altered)"))
#PCA
pr.out3=prcomp(data_new, scale=T)
summary(pr.out3)

## Importance of components:
##                            PC1     PC2     PC3     PC4     PC5     PC6     PC7
## Standard deviation     8.82836 6.73614 6.33474 5.71950 5.45437 4.75861 4.36269
## Proportion of Variance 0.06984 0.04066 0.03596 0.02931 0.02666 0.02029 0.01705
## Cumulative Proportion  0.06984 0.11050 0.14646 0.17577 0.20243 0.22272 0.23977
##                            PC8     PC9    PC10   PC11    PC12    PC13   PC14
## Standard deviation     4.28569 4.16523 4.06772 3.8964 3.87372 3.82535 3.7790
## Proportion of Variance 0.01646 0.01555 0.01483 0.0136 0.01345 0.01311 0.0128
## Cumulative Proportion  0.25623 0.27178 0.28660 0.3002 0.31365 0.32676 0.3396
##                           PC15    PC16    PC17    PC18    PC19    PC20   PC21
## Standard deviation     3.65627 3.54854 3.52922 3.43400 3.38464 3.34227 3.3076
## Proportion of Variance 0.01198 0.01128 0.01116 0.01057 0.01027 0.01001 0.0098
## Cumulative Proportion  0.35154 0.36282 0.37398 0.38455 0.39481 0.40482 0.4146
##                           PC22    PC23    PC24   PC25    PC26    PC27    PC28
## Standard deviation     3.28301 3.23131 3.21064 3.2039 3.14830 3.13143 3.12375
## Proportion of Variance 0.00966 0.00936 0.00924 0.0092 0.00888 0.00879 0.00874
## Cumulative Proportion  0.42429 0.43364 0.44288 0.4521 0.46096 0.46974 0.47849
##                           PC29    PC30    PC31    PC32    PC33    PC34    PC35
## Standard deviation     3.11074 3.04923 3.02259 2.99243 2.98409 2.96061 2.94870
## Proportion of Variance 0.00867 0.00833 0.00819 0.00802 0.00798 0.00785 0.00779
## Cumulative Proportion  0.48716 0.49549 0.50368 0.51170 0.51968 0.52753 0.53532
##                           PC36    PC37   PC38    PC39    PC40    PC41    PC42
## Standard deviation     2.89876 2.89418 2.8737 2.85587 2.83607 2.81730 2.79318
## Proportion of Variance 0.00753 0.00751 0.0074 0.00731 0.00721 0.00711 0.00699
## Cumulative Proportion  0.54285 0.55036 0.5578 0.56507 0.57228 0.57939 0.58638
##                           PC43    PC44    PC45   PC46    PC47    PC48    PC49
## Standard deviation     2.78432 2.76977 2.75335 2.7145 2.69979 2.68626 2.67783
## Proportion of Variance 0.00695 0.00687 0.00679 0.0066 0.00653 0.00647 0.00643
## Cumulative Proportion  0.59332 0.60020 0.60699 0.6136 0.62013 0.62659 0.63302
##                          PC50    PC51    PC52    PC53   PC54    PC55    PC56
## Standard deviation     2.6728 2.64983 2.63283 2.62550 2.5886 2.57907 2.55670
## Proportion of Variance 0.0064 0.00629 0.00621 0.00618 0.0060 0.00596 0.00586
## Cumulative Proportion  0.6394 0.64571 0.65192 0.65810 0.6641 0.67006 0.67592
##                           PC57    PC58    PC59    PC60    PC61    PC62    PC63
## Standard deviation     2.54214 2.52094 2.51580 2.51093 2.48917 2.47516 2.46118
## Proportion of Variance 0.00579 0.00569 0.00567 0.00565 0.00555 0.00549 0.00543
## Cumulative Proportion  0.68171 0.68741 0.69308 0.69873 0.70428 0.70977 0.71520
##                          PC64    PC65    PC66    PC67    PC68    PC69    PC70
## Standard deviation     2.4544 2.43513 2.42587 2.39763 2.36865 2.35923 2.35601
## Proportion of Variance 0.0054 0.00531 0.00527 0.00515 0.00503 0.00499 0.00497
## Cumulative Proportion  0.7206 0.72591 0.73118 0.73633 0.74136 0.74635 0.75132
##                           PC71    PC72    PC73    PC74    PC75    PC76    PC77
## Standard deviation     2.34175 2.33598 2.31150 2.29455 2.28546 2.28106 2.27162
## Proportion of Variance 0.00491 0.00489 0.00479 0.00472 0.00468 0.00466 0.00462
## Cumulative Proportion  0.75623 0.76112 0.76591 0.77063 0.77531 0.77997 0.78460
##                           PC78    PC79    PC80    PC81    PC82    PC83    PC84
## Standard deviation     2.26162 2.25533 2.23413 2.21893 2.20248 2.18513 2.17883
## Proportion of Variance 0.00458 0.00456 0.00447 0.00441 0.00435 0.00428 0.00425
## Cumulative Proportion  0.78918 0.79374 0.79821 0.80262 0.80697 0.81125 0.81550
##                           PC85    PC86    PC87    PC88    PC89    PC90    PC91
## Standard deviation     2.16777 2.15893 2.14572 2.14186 2.11967 2.11148 2.10216
## Proportion of Variance 0.00421 0.00418 0.00413 0.00411 0.00403 0.00399 0.00396
## Cumulative Proportion  0.81971 0.82389 0.82801 0.83212 0.83615 0.84014 0.84410
##                           PC92    PC93    PC94    PC95   PC96    PC97    PC98
## Standard deviation     2.08185 2.06646 2.06089 2.05271 2.0317 2.01344 2.00954
## Proportion of Variance 0.00388 0.00383 0.00381 0.00378 0.0037 0.00363 0.00362
## Cumulative Proportion  0.84799 0.85181 0.85562 0.85940 0.8631 0.86673 0.87035
##                           PC99   PC100   PC101   PC102   PC103   PC104   PC105
## Standard deviation     1.98949 1.97936 1.96526 1.95104 1.94293 1.93680 1.92728
## Proportion of Variance 0.00355 0.00351 0.00346 0.00341 0.00338 0.00336 0.00333
## Cumulative Proportion  0.87389 0.87740 0.88086 0.88427 0.88766 0.89102 0.89435
##                          PC106   PC107   PC108   PC109   PC110   PC111   PC112
## Standard deviation     1.92370 1.90916 1.88323 1.87992 1.87763 1.85515 1.84587
## Proportion of Variance 0.00332 0.00327 0.00318 0.00317 0.00316 0.00308 0.00305
## Cumulative Proportion  0.89766 0.90093 0.90411 0.90727 0.91043 0.91352 0.91657
##                          PC113   PC114   PC115   PC116   PC117   PC118   PC119
## Standard deviation     1.84265 1.82282 1.81941 1.79608 1.79087 1.77435 1.77104
## Proportion of Variance 0.00304 0.00298 0.00297 0.00289 0.00287 0.00282 0.00281
## Cumulative Proportion  0.91961 0.92259 0.92555 0.92845 0.93132 0.93414 0.93695
##                          PC120   PC121   PC122   PC123  PC124   PC125   PC126
## Standard deviation     1.75900 1.73295 1.71807 1.71170 1.7021 1.68289 1.66856
## Proportion of Variance 0.00277 0.00269 0.00264 0.00263 0.0026 0.00254 0.00249
## Cumulative Proportion  0.93972 0.94241 0.94506 0.94768 0.9503 0.95282 0.95531
##                          PC127   PC128   PC129   PC130   PC131   PC132   PC133
## Standard deviation     1.66002 1.64531 1.62756 1.61960 1.59355 1.58198 1.57037
## Proportion of Variance 0.00247 0.00243 0.00237 0.00235 0.00228 0.00224 0.00221
## Cumulative Proportion  0.95778 0.96021 0.96258 0.96493 0.96721 0.96945 0.97166
##                          PC134   PC135   PC136   PC137  PC138   PC139   PC140
## Standard deviation     1.56218 1.54608 1.52363 1.49754 1.4933 1.47947 1.45876
## Proportion of Variance 0.00219 0.00214 0.00208 0.00201 0.0020 0.00196 0.00191
## Cumulative Proportion  0.97385 0.97599 0.97807 0.98008 0.9821 0.98404 0.98594
##                          PC141   PC142   PC143   PC144   PC145   PC146  PC147
## Standard deviation     1.43328 1.42813 1.39696 1.37224 1.31842 1.27620 1.2480
## Proportion of Variance 0.00184 0.00183 0.00175 0.00169 0.00156 0.00146 0.0014
## Cumulative Proportion  0.98778 0.98961 0.99136 0.99305 0.99461 0.99607 0.9975
##                          PC148   PC149     PC150
## Standard deviation     1.22467 1.15479 3.327e-15
## Proportion of Variance 0.00134 0.00119 0.000e+00
## Cumulative Proportion  0.99881 1.00000 1.000e+00

#########################################################################################
######Create a dataframe with scan#, m/z, intensity coordinates, sample type, and name for
#the significant variables
Av3=Av

data_Av3=training_transformed %>% select(all_of(Av3))
sample_import=data.frame(species_names_new,y2,data_Av3)

#Coordinates for scan# and m/z for the significant variables
cord=data.frame(xx2,yy2)
colnames(cord)=c("scan #", "m/z")

#Intensity values for the significant variables for each sample
ss=list()
for(i in 1:150){
    ss[[i]] = sample_import[i,]
}

sample_frame = as.data.frame(t(do.call(rbind,ss)))
colnames(sample_frame)=species_names_new
#The significant variables with their scan# and m/z coordinates and 
#intensity value for each sample
cord2=cbind(cord,sample_frame[3:(length(Av3)+2),]) 
#Code for converting dataframe to numeric based on code found in 
#"Code for converting entire data frame to numeric":
#https://stackoverflow.com/questions/60288057/code-for-converting-entire-data-frame-to-numeric ,
#stack overflow
cord2=mutate_all(cord2, function(x) as.numeric(as.character(x)))

#Create a dataframe for abiotic samples with scan#, m/z value coordinates, 
#intensity, sample type
data_mA=list()
for (i in 1:lA){
      data_mA[[i]]= data.frame(xx2,yy2,as.numeric(data_Av3[indexA[i],]))
      colnames(data_mA[[i]])=c("Scan","Mass_to_charge_ratio","Intensity")
}
typeA=factor(rep("Abiotic",lA*length(Av3)))
data3DA=do.call(rbind,data_mA)
data3DA=data.frame(data3DA,typeA)
colnames(data3DA)=c("Scan","Mass_to_charge_ratio","Intensity","Type")
NameA=factor(rep(split_name_new[indexA],each=length(Av3)))#abiotic sample names

#Create a dataframe for contemporary biotic samples with scan#, m/z value coordinates, 
#intensity, sample type
data_mB=list()
for (i in 1:lB){
     data_mB[[i]]= data.frame(xx2,yy2,as.numeric(data_Av3[indexB[i],]))
     colnames(data_mB[[i]])=c("Scan","Mass_to_charge_ratio","Intensity")
} 
typeB=factor(rep("Biotic (cont.)",lB*length(Av3)))
data3DB=do.call(rbind,data_mB)
data3DB=data.frame(data3DB,typeB)
colnames(data3DB)=c("Scan","Mass_to_charge_ratio","Intensity","Type")
NameB=factor(rep(split_name_new[indexB],each=length(Av3)))

#Create a dataframe for altered biotic samples with scan#, m/z value coordinates, 
#intensity, sample type
data_mC=list()
for (i in 1:lC){
     data_mC[[i]]= data.frame(xx2,yy2,as.numeric(data_Av3[indexC[i],]))
    colnames(data_mC[[i]])=c("Scan","Mass_to_charge_ratio","Intensity")
}
typeC=factor(rep("Biotic (alt.)",lC*length(Av3)))
data3DC=do.call(rbind,data_mC)
data3DC=data.frame(data3DC,typeC)
colnames(data3DC)=c("Scan","Mass_to_charge_ratio","Intensity","Type")
NameC=factor(rep(split_name_new[indexC],each=length(Av3)))

NameS=c(NameA, NameB, NameC)
data3D=rbind(data3DA, data3DB, data3DC)
#dataframe with scan#, m/z, intensity coordinates, sample type, and name for
#the significant variables
data3D=cbind(data3D,NameS)
ind_all=which(data3D[,3]>0)


###################################################################################
#Figure S2
##3D-PCA
cols=c("#B15928","#33A02C","#1F78B4")
Cols=function(vec){
cols=c("#B15928","#33A02C","#1F78B4")
return(cols[as.numeric(as.factor(vec))])
}

library(rgl)
knitr::knit_hooks$set(webgl = hook_webgl)

windowsFonts(A = windowsFont("sans")) 
plot3d(pr.out3$x[,1:3], col = Cols(y2), type = 's', radius = .2  )
text3d(pr.out3$x[,1:3],
       texts=split_name_new,
       ps= 0.6, pos=3, family="A")
 
bbox3d(color = c("grey", "black"), emission = "grey", 
         specular = "grey", shininess = 5, alpha = 0.8)

###############################################################################
#Barplot of the significant variables 
#Figure S5

par(mfcol = c(4, 1), mar = numeric(4),oma = c(5, 4, .5, .5),
    mai = c(0.05, 0.2, 0.3, 0.2),
    mgp = c(2, .6, 0),family="sans", ps=9)

barplot(D_A_array,beside=TRUE,ylim=c(0,1),col="#B15928" ,las=2,axes=FALSE,
        main="Abiotic samples",width=5)
text(-2.2,1.12,label=substitute(paste(bold(('a')))),col="black",xpd=NA)
axis(2L)
box()

barplot(D_B_array,beside=TRUE,ylim=c(0,1),col="#6A3D9A",las=2,axes=FALSE,
        main="Biotic samples")
text(-2.2,1.12,label=substitute(paste(bold(('b')))), col="black",xpd=NA)
axis(2L)
box()

barplot(D_B_Cont_array,beside=TRUE,ylim=c(0,1),col="#33A02C" ,las=2,axes=FALSE,
        main="Contemporary biotic samples")
text(-2.2,1.12,label=substitute(paste(bold(('c')))), col="black",xpd=NA)
axis(2L)
box()

barplot(D_B_Alt_array,beside=TRUE,ylim=c(0,1),col="#1F78B4",las=2,axes=FALSE,
        main="Altered biotic samples")
text(-2.2,1.12,label=substitute(paste(bold(('d')))), col="black",xpd=NA)
axis(2L)
box()

mtext("Proportion of samples", side = 2, outer = TRUE, line = 2.2, cex=0.9)
mtext("Significant features", side = 1, outer = TRUE, line = 2.2, cex=0.9)

#dev.off()

#############################################################################
#Plot scan#:m/z:intensity for the significant variables with sample names
#for each point
#Figure S7
fig=plot_ly(data=data3D[ind_all,], x=~Scan,y=~Mass_to_charge_ratio,
            z=~Intensity, type="scatter3d", mode="markers",marker=list(size = 3), color=~Type,colors=c("#B15928","#33A02C","#1F78B4"),text = ~paste("", NameS))
fig=fig %>% layout(legend = list(x = 0.4, y = 0.95,orientation = 'h',
                                 itemsizing='constant',bgcolor ="ghostwhite"))
fig