# Metabolomics data analysis script for plasma samples

title: “Metabolomics data analysis script for plasma samples” author: “Marat Kasakin” date: “2022-06-12” output: html_document — ## Install required packages

Load files with the data output preprocessed in tracefinder

read.csv("~/20220406_MiceSPF_plasma.csv", stringsAsFactors = FALSE, skip = 0, header = TRUE) -> tf.polar.m
dim(tf.polar.m)
## [1] 40641    67

40641 67

Load GC-MS data preprocessed in Metabolite Detector

read.xlsx('/Users/marat.kasakin/Desktop/CRC project/Experimental/Mina Collaboration/2021_03_Diettary mouse study SPF/GC-MS_LCMS/20210226_3TMVJ_MT001_PLASMA_results.xlsx', sheet = 4, startRow=1, colNames = TRUE, rowNames = FALSE, detectDates = FALSE, skipEmptyRows = TRUE,
          skipEmptyCols = TRUE, rows = NULL, cols = NULL, check.names = FALSE,
          namedRegion = NULL, na.strings = "NA", fillMergedCells = FALSE) -> mouse_data1
dim(mouse_data1) # 92 17
## [1] 92 17

92 17

Load tracefinder annotation data (manually curated with the pooled plasma sample spiked with the standard mix)

read.xlsx('/Users/marat.kasakin/Desktop/CRC project/Experimental/Mina Collaboration/2021_03_Diettary mouse study SPF/GC-MS_LCMS/annot.data.spf.xlsx', sheet = 1, startRow=1, colNames = TRUE, rowNames = FALSE, detectDates = FALSE, skipEmptyRows = TRUE,
          skipEmptyCols = TRUE, rows = NULL, cols = NULL, check.names = FALSE,
          namedRegion = NULL, na.strings = "NA", fillMergedCells = FALSE) -> anot.data1
dim(anot.data1) # 174 metabolites in the rows in annotated data 5 columns
## [1] 174   5
colnames(anot.data1) # [1] "compound"                                            
## [1] "compound"                                            
## [2] "annotation.level"                                    
## [3] "ms2.(yes.or.no)"                                     
## [4] "Requires.additional.verification.with.std,.yes.or.no"
## [5] "X5"
# [2] "annotation.level"                                    
# [3] "ms2.(yes.or.no)"                                     
# [4] "Requires.additional.verification.with.std,.yes.or.no"
# [5] "X5" 

Filtering features that doesn’t have any ms2 data or confimation with standard

tf.polar.m.2 <- tf.polar.m.1 %>% select(Filename, contains(filter(anot.data1, annotation.level %in% 1:3 & `ms2.(yes.or.no)`==1)[, 1]))
tf.polar.m.2[, 1] # remove first 3 rows: standard mix, pooled sample and spiked pooled sample
## # A tibble: 19 × 1
##    Filename                                 
##    <chr>                                    
##  1 LC-EMunt_mix_50uM_01                     
##  2 MT001_LC-EMunt_Plasma_Pool_01            
##  3 MT001_LC-EMunt_Plasma_Pool_spiked        
##  4 MT001_LC-EMunt_Plasma01_Std-Diet_1       
##  5 MT001_LC-EMunt_Plasma02_Std-Diet_1       
##  6 MT001_LC-EMunt_Plasma03_Std-Diet_1       
##  7 MT001_LC-EMunt_Plasma04_Std-Diet_1       
##  8 MT001_LC-EMunt_Plasma11_HighFat_LowCarb_1
##  9 MT001_LC-EMunt_Plasma12_HighFat_LowCarb_1
## 10 MT001_LC-EMunt_Plasma13_HighFat_LowCarb_1
## 11 MT001_LC-EMunt_Plasma16_Std-Diet_1       
## 12 MT001_LC-EMunt_Plasma17_Std-Diet_1       
## 13 MT001_LC-EMunt_Plasma18_Std-Diet_1       
## 14 MT001_LC-EMunt_Plasma19_Std-Diet_1       
## 15 MT001_LC-EMunt_Plasma20_Std-Diet_1       
## 16 MT001_LC-EMunt_Plasma21_HighFat_LowCarb_1
## 17 MT001_LC-EMunt_Plasma22_HighFat_LowCarb_1
## 18 MT001_LC-EMunt_Plasma23_HighFat_LowCarb_1
## 19 MT001_LC-EMunt_Plasma24_HighFat_LowCarb_1
tf.polar.m.2[-1:-3, ] -> tf.polar.m.3
tf.polar.m.4 <- tf.polar.m.3 %>% add_column(c(rep("Standard_diet", 4), rep("HighFat_LowCarb_diet", 3), rep("Standard_diet", 5), rep("HighFat_LowCarb_diet", 4)), .before = "Filename")
tf.polar.m.5 <- tf.polar.m.4 %>% rename(Sample.Type = colnames(tf.polar.m.4)[1], Sample.Name = colnames(tf.polar.m.4)[2])
dim(tf.polar.m.5) # 16 129
## [1]  16 129

NA values filtering (variant 1 remove all metabolites with any NA)

tf.polar.m.6 <- tf.polar.m.5 %>% select_if(~ !any(is.na(.)))
dim(tf.polar.m.6) # 16 89
## [1] 16 89

NA values filtering (preserve metablolites with less then 20% NAs)

tf.polar.m.5-> tf.polar.m.5a
## NA
tf.polar.m.5a[, -which(colMeans(is.na(tf.polar.m.5a)) > 0.20)] -> tf.polar.m.5b # removed variables with more that 20% of NAs
dim(tf.polar.m.5b) # 16 100
## [1]  16 100

Normalization by total metabolites Area folowed by correction by mean value of total metabolites values of all samples

rownames(tf.polar.m.5b) <- tf.polar.m.5b$Sample.Name
## Warning: Setting row names on a tibble is deprecated.
as.data.frame(t(tf.polar.m.5b[, 3:ncol(tf.polar.m.5b)])) -> tf.polar.m.7
names(tf.polar.m.7) <- tf.polar.m.5b$Sample.Name
rownames(tf.polar.m.7) -> Metabolites
colSums(tf.polar.m.7, na.rm = T) -> Metabolite.sums
mean(Metabolite.sums) -> mean.Sums
Metabolite.sums/mean.Sums -> metabolite.sums.rel
mapply(`/`, tf.polar.m.7, metabolite.sums.rel) -> tf.polar.m.7a
as.data.frame(tf.polar.m.7a) -> tf.polar.m.8
rownames(tf.polar.m.8) <- Metabolites
dim(tf.polar.m.8) # 98 16
## [1] 98 16
as.data.frame(t(tf.polar.m.8)) -> tf.polar.m.9
dim(tf.polar.m.9) # 16 98
## [1] 16 98
cbind(tf.polar.m.5[, 1:2], tf.polar.m.9) -> tf.polar.m.10
tf.polar.m.10[, 1:2] # ok
##                                                    Sample.Type
## MT001_LC-EMunt_Plasma01_Std-Diet_1               Standard_diet
## MT001_LC-EMunt_Plasma02_Std-Diet_1               Standard_diet
## MT001_LC-EMunt_Plasma03_Std-Diet_1               Standard_diet
## MT001_LC-EMunt_Plasma04_Std-Diet_1               Standard_diet
## MT001_LC-EMunt_Plasma11_HighFat_LowCarb_1 HighFat_LowCarb_diet
## MT001_LC-EMunt_Plasma12_HighFat_LowCarb_1 HighFat_LowCarb_diet
## MT001_LC-EMunt_Plasma13_HighFat_LowCarb_1 HighFat_LowCarb_diet
## MT001_LC-EMunt_Plasma16_Std-Diet_1               Standard_diet
## MT001_LC-EMunt_Plasma17_Std-Diet_1               Standard_diet
## MT001_LC-EMunt_Plasma18_Std-Diet_1               Standard_diet
## MT001_LC-EMunt_Plasma19_Std-Diet_1               Standard_diet
## MT001_LC-EMunt_Plasma20_Std-Diet_1               Standard_diet
## MT001_LC-EMunt_Plasma21_HighFat_LowCarb_1 HighFat_LowCarb_diet
## MT001_LC-EMunt_Plasma22_HighFat_LowCarb_1 HighFat_LowCarb_diet
## MT001_LC-EMunt_Plasma23_HighFat_LowCarb_1 HighFat_LowCarb_diet
## MT001_LC-EMunt_Plasma24_HighFat_LowCarb_1 HighFat_LowCarb_diet
##                                                                         Sample.Name
## MT001_LC-EMunt_Plasma01_Std-Diet_1               MT001_LC-EMunt_Plasma01_Std-Diet_1
## MT001_LC-EMunt_Plasma02_Std-Diet_1               MT001_LC-EMunt_Plasma02_Std-Diet_1
## MT001_LC-EMunt_Plasma03_Std-Diet_1               MT001_LC-EMunt_Plasma03_Std-Diet_1
## MT001_LC-EMunt_Plasma04_Std-Diet_1               MT001_LC-EMunt_Plasma04_Std-Diet_1
## MT001_LC-EMunt_Plasma11_HighFat_LowCarb_1 MT001_LC-EMunt_Plasma11_HighFat_LowCarb_1
## MT001_LC-EMunt_Plasma12_HighFat_LowCarb_1 MT001_LC-EMunt_Plasma12_HighFat_LowCarb_1
## MT001_LC-EMunt_Plasma13_HighFat_LowCarb_1 MT001_LC-EMunt_Plasma13_HighFat_LowCarb_1
## MT001_LC-EMunt_Plasma16_Std-Diet_1               MT001_LC-EMunt_Plasma16_Std-Diet_1
## MT001_LC-EMunt_Plasma17_Std-Diet_1               MT001_LC-EMunt_Plasma17_Std-Diet_1
## MT001_LC-EMunt_Plasma18_Std-Diet_1               MT001_LC-EMunt_Plasma18_Std-Diet_1
## MT001_LC-EMunt_Plasma19_Std-Diet_1               MT001_LC-EMunt_Plasma19_Std-Diet_1
## MT001_LC-EMunt_Plasma20_Std-Diet_1               MT001_LC-EMunt_Plasma20_Std-Diet_1
## MT001_LC-EMunt_Plasma21_HighFat_LowCarb_1 MT001_LC-EMunt_Plasma21_HighFat_LowCarb_1
## MT001_LC-EMunt_Plasma22_HighFat_LowCarb_1 MT001_LC-EMunt_Plasma22_HighFat_LowCarb_1
## MT001_LC-EMunt_Plasma23_HighFat_LowCarb_1 MT001_LC-EMunt_Plasma23_HighFat_LowCarb_1
## MT001_LC-EMunt_Plasma24_HighFat_LowCarb_1 MT001_LC-EMunt_Plasma24_HighFat_LowCarb_1
tf.polar.m.10$Sample.Type -> Sample.Type

Filtering duplicated metabolites from LC-MS data that were measured in GC-MS analysis

c("Fructose", "Galactose", "Glucose 1-phosphate", "Glucose-6-phosphate", "Isocitric acid", "Isoleucine", "Mannose") -> excl.list.lc # this data will be taken from GCMS data
tf.polar.m.10[, -which(colnames(tf.polar.m.10) %in% excl.list.lc)] -> tf.polar.m.12
dim(tf.polar.m.12) # 16 93
## [1] 16 93

Univariate statistical analysis for LC-MS data

t_test <-function(x) {
  t.test(x~Sample.Type, data=tf.polar.m.10, na.rm=T)
}
apply(tf.polar.m.12[, -1:-2], 2, t_test) -> t.test.data
as.data.frame(t(sapply(t.test.data, '[', c("statistic","p.value")))) -> ttest_stat
filter(tf.polar.m.12, Sample.Type == "Standard_diet") -> Std.m
filter(tf.polar.m.12, Sample.Type == "HighFat_LowCarb_diet") -> HighF.m
apply(Std.m[, -1:-2], 2, mean, na.rm =T) -> mean.std
apply(HighF.m[ -1:-2], 2, mean, na.rm =T) -> mean.hf
as.data.frame(cbind(mean.std, mean.hf)) -> means.SPF
  transform(means.SPF, fold.changes = mean.std / mean.hf) -> fold.changes
  ttest_stat[order(as.data.frame(ttest_stat$p.value), decreasing = F), ] -> ttest.res
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames
  p.adjust(ttest.res$p.value, method = "fdr") -> p.fdr
  cbind(rownames(ttest.res), ttest.res, p.fdr) -> ttest.stat
  merge(ttest.stat, fold.changes, by="row.names") -> ttest.stat.1
  ttest.stat.1[order(as.data.frame(ttest.stat.1$p.value), decreasing = F), ] -> ttest.stat.2
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames
  ttest.stat.2[, c(1, 3:8)] -> ttest.stat.3
  names(ttest.stat.3) <- c("Metabolites", "statistic", "p.value", "p.fdr", "mean.StdDiet", "mean.HighFat.LowCarbDiet", "Fold.changes")
rownames(ttest.stat.3) <- ttest.stat.3$Metabolites
dim(ttest.stat.3) #91 7
## [1] 91  7
write.xlsx(ttest.stat.3,  "~/ttest-SPF_tr.xlsx")

Preprocessing GC-MS data: transposition, feature filtering

rownames(mouse_data1) <- mouse_data1$Metabolite # Set rownames as metabolites
as.data.frame(t(mouse_data1[, -1])) -> mouse_data2
dim(mouse_data2) # 16 observations and 92 features
## [1] 16 92
mouse_data2[, -grep(c('\\JP_|IS_|No match|No direct'), colnames(mouse_data2))] -> mouse_data3
dim(mouse_data3) # 16 observations and 37 metabolites
## [1] 16 37

Load sample conditions for GC-MS data from additional file

read.xlsx('/Users/marat.kasakin/Desktop/CRC project/Experimental/Mina Collaboration/2021_03_Diettary mouse study SPF/GC-MS_LCMS/Sample_condition_2.xlsx', sheet = 1, startRow=1, colNames = TRUE, rowNames = FALSE, detectDates = FALSE, skipEmptyRows = TRUE,
          skipEmptyCols = TRUE, rows = NULL, cols = NULL, check.names = FALSE,
          namedRegion = NULL, na.strings = "NA", fillMergedCells = FALSE) -> sample_condition
rownames(sample_condition) <- sample_condition$Sample
merge(sample_condition, mouse_data3, by = "row.names") -> mouse_data4
rownames(mouse_data4) <- mouse_data4$Row.names
mouse_data4[, -c(1:2)] -> mouse_data5
dim(mouse_data5) # 16 38
## [1] 16 38
colnames(mouse_data5) # GC-MS data related to metabolite derivatives with TMS, some has duplicates in GC-MS data, some metabolites has duplicates in LC-MS data
##  [1] "Diet"                           "Lactic_acid_2TMS"              
##  [3] "Glycine_2TMS"                   "Glycolic_acid_2TMS"            
##  [5] "Alanine_2TMS"                   "2-Hydroxypyridine_1TMS"        
##  [7] "Pyruvic_acid_1MEOX_1TMS"        "2-Hydroxybutyric_acid_2TMS"    
##  [9] "3-Hydroxybutyric_acid_2TMS"     "Carbonic_acid_1MeOX_2TMS"      
## [11] "Oxalic_acid_2TMS"               "Valine_2TMS"                   
## [13] "Glycerol_3TMS"                  "Leucine_2TMS"                  
## [15] "Isoleucine_2TMS"                "Glycine_3TMS"                  
## [17] "Phosphoric_acid_3TMS"           "Glyceric_acid_3TMS"            
## [19] "Serine_3TMS"                    "Succinic_acid_2TMS"            
## [21] "Threonine_3TMS"                 "Fumaric_acid_2TMS"             
## [23] "Malic_acid_3TMS"                "Erythronic_acid_4TMS"          
## [25] "Threonic_acid_4TMS"             "Methionine_2TMS"               
## [27] "Lyxose_1MeOX_4TMS_BP"           "Pyroglutamic_acid_2TMS"        
## [29] "2-oxo-glutaric_acid_1MeOX_2TMS" "1,5-Anhydro-D-glucitol_4TMS"   
## [31] "Fructose_1MEOX_5TMS_MP"         "Mannose_1MEOX_5TMS_MP"         
## [33] "Fructose_1MEOX_5TMS_BP"         "Citric_acid_4TMS"              
## [35] "Lysine_4TMS"                    "Gluconic_acid_6TMS"            
## [37] "Inositol_6TMS"                  "Hexadecanoic_acid_1TMS"

Univariate statistical analysis for GC-MS data

t_test <-function(x) {
  t.test(x~Diet, data=mouse_data5)
}
apply(mouse_data5[, -1], 2, t_test) -> t.test.data
as.data.frame(t(sapply(t.test.data, '[', c("statistic","p.value")))) -> ttest_stat.gc
filter(mouse_data5, Diet == "high fat low carb diet") -> GC.HighF.m
filter(mouse_data5, Diet == "standard diet") -> GC.Std.m
apply(GC.Std.m[, -1], 2, mean, na.rm =T) -> mean.std.gc
apply(GC.HighF.m[, -1], 2, mean, na.rm =T) -> mean.hf.gc
as.data.frame(cbind(mean.std.gc, mean.hf.gc)) -> means.gc.SPF
transform(means.gc.SPF, fold.changes = mean.std.gc / means.gc.SPF) -> fold.changes.gc
p.adjust(ttest_stat.gc$p.value, method = "fdr") -> p.fdr
cbind(ttest_stat.gc, p.fdr) -> ttest_stat.gc.1
merge(ttest_stat.gc.1, fold.changes.gc[, -3], by="row.names") -> ttest.stat.1.gc
ttest.stat.1.gc[order(as.data.frame(ttest.stat.1.gc$p.value), decreasing = F), ] -> ttest.stat.2.gc
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames
names(ttest.stat.2.gc) <- c("Metabolites", "statistic", "p.value", "p.fdr", "mean.StdDiet", "mean.HighFat.LowCarbDiet", "Fold.changes")
rownames(ttest.stat.2.gc) <- ttest.stat.2.gc$Metabolites
write.xlsx(ttest.stat.2.gc,  "~/ttest-SPF_gc.xlsx")

Manual curation of the peak shapes in LC-MS data in tracefinder, comparison and filtering dublicated metabolites in LC-MS amd GC-MS data, correction metabolites names for GC-MS data (TMS derivative names were sunstituted by the original metabolite names), selection between TMS derivatives of the for same metabolite based on intensity in GCMS data

Load tables with added corrected metabolites names columns and preselected metabolites for further data analysis

read.xlsx('/Users/marat.kasakin/Desktop/CRC project/Experimental/Mina Collaboration/2021_03_Diettary mouse study SPF/GC-MS_LCMS/ttest-SPF_tr1.xlsx', sheet = 1, startRow=1, colNames = TRUE, rowNames = FALSE, detectDates = FALSE, skipEmptyRows = TRUE,
          skipEmptyCols = TRUE, rows = NULL, cols = NULL, check.names = FALSE,
          namedRegion = NULL, na.strings = "NA", fillMergedCells = FALSE) -> sel.Tr

read.xlsx('/Users/marat.kasakin/Desktop/CRC project/Experimental/Mina Collaboration/2021_03_Diettary mouse study SPF/GC-MS_LCMS/ttest-SPF_gc1.xlsx', sheet = 1, startRow=1, colNames = TRUE, rowNames = FALSE, detectDates = FALSE, skipEmptyRows = TRUE,
          skipEmptyCols = TRUE, rows = NULL, cols = NULL, check.names = FALSE,
          namedRegion = NULL, na.strings = "NA", fillMergedCells = FALSE) -> sel.GC
dim(mouse_data5) # 16 38
## [1] 16 38
mouse_data5[, c(1, which(colnames(mouse_data5) %in% sel.GC[complete.cases(sel.GC), 2]))] -> mouse_data6
match(sel.GC[complete.cases(sel.GC), 2], colnames(mouse_data6)) -> index.gc
names(mouse_data6)[index.gc] <- sel.GC[complete.cases(sel.GC), 1]
dim(mouse_data6) # 16 27
## [1] 16 27
colnames(tf.polar.m.12)[1] <- "Diet"
dim(tf.polar.m.12) # 16 93
## [1] 16 93
tf.polar.m.12[, c(1, which(colnames(tf.polar.m.12) %in% sel.Tr[complete.cases(sel.Tr), 2]))] -> tf.polar.m.13
match(sel.Tr[complete.cases(sel.Tr), 2], colnames(tf.polar.m.13)) -> index.lc
names(tf.polar.m.13)[index.lc] <- sel.Tr[complete.cases(sel.Tr), 1]
dim(tf.polar.m.13) # 16 75
## [1] 16 75

Unite GCMS and LCMS data, scaling data:

### unification of sample names and diet abbreviations:
rownames(tf.polar.m.13) -> rows.df.tr
rows.df.tr1 <- rows.df.tr %>% str_replace_all("MT001_LC-EMunt_Plasma", "") %>%
  str_replace_all("_Std-Diet_1", "") %>%
  str_replace_all("_HighFat_LowCarb_1", "")
rows.df.tr1 -> rownames(tf.polar.m.13)
rownames(mouse_data6) -> rows.df.gc
rows.df.gc1 <- rows.df.gc %>% str_replace_all("MT001_MousePlasma_HighFatLowCarb_", "") %>%
  str_replace_all("MT001_MousePlasma_Standard_", "") %>%
  str_replace_all("^3", "03") %>%
  str_replace_all("^4", "04")
rows.df.gc1[8] <- "01"
rows.df.gc1[13] <- "02"
rows.df.gc1 -> rownames(mouse_data6)
# now rownames are unified, merge table, rename rownames column
merge(mouse_data6, tf.polar.m.13, by="row.names") -> united.data.SPF
united.data.SPF[, -which(colnames(united.data.SPF) == "Diet.y")] -> united.SPF
names(united.SPF)[1:2] <- c("SampleID", "Diet")
rownames(united.SPF) <- united.SPF$SampleID
dim(united.SPF) #16 102
## [1]  16 102

PCA

dim(na.omit(united.SPF[, -1:-2])) # 7 101 too low number of observations remain
## [1]   7 100
# variant 1 remove metabolites with NA values:
united.SPF1<- united.SPF %>%
    select_if(~ !any(is.na(.)))
prcomp(united.SPF1[, -1:-2], scale = T, center = T) -> pca.SPF
diet <- as.factor(united.SPF1$Diet)
library(factoextra)
fviz_pca_ind(pca.SPF,
             palette = c("red", "blue"),
             geom.ind = "point",
             col.ind = united.SPF1$Diet,
             addEllipses = TRUE, # Concentration ellipses
             ellipse.type = "t", ellipse.level = 0.67,
             legend.title = "Diet",
             repel = TRUE,
             labelsize = 4, pointsize = 2) +
  theme(text = element_text(size = 20), legend.position = c(0.8, 0.15)) -> ind.mouse
ggpubr::ggpar(ind.mouse, subtitle = "t-distribution, 67% CI",  xlab = paste("PC1=",
        round((get_eigenvalue(pca.SPF)$variance.percent[1]), 2), "%"),
        ylab = paste("PC2=", round((get_eigenvalue(pca.SPF)$variance.percent[2]), 2), "%")) -> p
pdf("pca_ind_SPF_v1.pdf", width = 8, height = 8)
p
dev.off()
## quartz_off_screen 
##                 2
p

PCA variant 2 imputation of NA values (substitution by mean values)

# variant 2 imputation of NA values (substitution by mean values)
Meanf=function(x){
  x<-as.numeric(as.character(x)) #first convert each column into numeric from factor
  x[is.na(x)] =mean(x, na.rm=TRUE) #convert the item with NA to mean value from the column
  x #display the column
}
imp.data.SPF=data.frame(apply(united.SPF[, -1:-2], 2, Meanf))
cbind(united.SPF[1:2], imp.data.SPF) -> imp.data.SPF.1
prcomp(imp.data.SPF.1[, -1:-2], scale = T, center = T) -> pca.SPF

fviz_pca_ind(pca.SPF,
             palette = c("red", "blue"),
             geom.ind = "point",
             col.ind = imp.data.SPF.1$Diet,
             addEllipses = TRUE, # Concentration ellipses
             ellipse.type = "t", ellipse.level = 0.67,
             legend.title = "Diet",
             repel = TRUE,
             labelsize = 4, pointsize = 2) +
  theme(text = element_text(size = 20), legend.position = c(0.8, 0.15)) -> ind.mouse
ggpubr::ggpar(ind.mouse, subtitle = "t-distribution, 67% CI",  xlab = paste("PC1=",
        round((get_eigenvalue(pca.SPF)$variance.percent[1]), 2), "%"),
        ylab = paste("PC2=", round((get_eigenvalue(pca.SPF)$variance.percent[2]), 2), "%")) -> p
pdf("pca_ind_SPF_v2.pdf", width = 8, height = 8)
p
dev.off()
## quartz_off_screen 
##                 2
p

## PCA variant 3: NA cleaning and with exclusion list of metabolites

# variant 3 selected metabolites
# list of exclusions:
c("2-oxoglutaric acid", "Dopamine", "Indole", "Indole-3-propionic acid", "Niacin", "Quinolinic acid",
  "3-Hydroxyanthranilic acid", "3-(4-Hydroxyphenyl)propionic acid", "Anthranilic acid",
  "Indole-3-carboxylic acid", "Quinaldic acid", "3-Methoxytyramine", "Indole-3-glyoxylic acid") -> exclusion.list

united.SPF1[, -which(colnames(united.SPF1) %in% exclusion.list)] -> united.SPF2
dim(united.SPF2) # 16 84
## [1] 16 83
prcomp(united.SPF2[, -1:-2], scale = T, center = T) -> pca.SPF.sel
diet <- as.factor(pca.SPF.sel$Diet)
pdf("pca_ind_SPF_v3.pdf", width = 8, height = 8)
fviz_pca_ind(pca.SPF.sel,
             palette = c("red", "blue"),
             geom.ind = "point",
             col.ind = united.SPF2$Diet,
             addEllipses = TRUE, # Concentration ellipses
             ellipse.type = "t", ellipse.level = 0.67,
             legend.title = "Diet",
             repel = TRUE,
             labelsize = 4, pointsize = 2) +
  theme(text = element_text(size = 20), legend.position = c(0.8, 0.15)) -> ind.mouse
ggpubr::ggpar(ind.mouse, subtitle = "t-distribution, 67% CI",  xlab = paste("PC1=",
                                                                            round((get_eigenvalue(pca.SPF.sel)$variance.percent[1]), 2), "%"),
              ylab = paste("PC2=", round((get_eigenvalue(pca.SPF.sel)$variance.percent[2]), 2), "%")) -> p
p
dev.off()
## quartz_off_screen 
##                 2
p

## HCA and complex Heatmap construction for samples

library(RColorBrewer)
library(GlobalOptions)
library(shape)
library(ComplexHeatmap)
df.m <- scale(united.SPF2[, -1:-2]) # all means are equal to zero, all sd equal to 1
cbind(united.SPF2[, 1:2], df.m) -> united.SPF3
distmat_mouse <- dist(df.m[, -1:-2], method = "euclidean")
hcldat_mouse <- hclust(distmat_mouse, method = "average")
cor(distmat_mouse, cophenetic(hcldat_mouse)) # 0.7748869
## [1] 0.7748869
#####################
# Create the heatmap annotation for samples diet:
brewer.pal(2, "Spectral") -> color.key1
## Warning in brewer.pal(2, "Spectral"): minimal value for n is 3, returning requested palette with 3 different levels
annot_df <- data.frame(Diet = united.SPF3$Diet, stringsAsFactors = FALSE)
color_map = list(Diet = c("high fat low carb diet" = "red", "standard diet" = "blue"))
ha <- HeatmapAnnotation(df = annot_df, name = colnames(annot_df),
                        col = color_map, border = TRUE, gap = unit(1, "cm"), simple_anno_size = unit(2, "cm"))

Creation Heatmap annotation for metabolites

library(circlize)
## ========================================
## circlize version 0.4.14
## CRAN page: https://cran.r-project.org/package=circlize
## Github page: https://github.com/jokergoo/circlize
## Documentation: https://jokergoo.github.io/circlize_book/book/
## 
## If you use it in published research, please cite:
## Gu, Z. circlize implements and enhances circular visualization
##   in R. Bioinformatics 2014.
## 
## This message can be suppressed by:
##   suppressPackageStartupMessages(library(circlize))
## ========================================
max(apply(united.SPF3[, -1:-2] , 2, max)) # +3.6984
## [1] 3.698494
min(apply(united.SPF3[, -1:-2] , 2, min)) # -2.6111
## [1] -2.611114
colnames(united.SPF[, -1:-2]) -> Metabolites
write.xlsx(data.frame(Metabolites), "~Metabolites.xlsx") # manually assigned metabolite groups based on KEGG and HMDB databases
read.xlsx("/Users/marat.kasakin/rworkshop-MKasakin/Metabolites.xlsx", startRow=1, colNames = TRUE, rowNames = FALSE, detectDates = FALSE, skipEmptyRows = TRUE, skipEmptyCols = TRUE, rows = NULL, cols = NULL, check.names = FALSE,
          namedRegion = NULL, na.strings = "NA", fillMergedCells = FALSE) -> Metabolites
Metabolites
##                           Metabolites                          Pathways.group
## 1                       Glycolic acid                 Carbohydrate metabolism
## 2                   2-Hydroxypyridine                             unspecified
## 3                        Pyruvic acid                 Carbohydrate metabolism
## 4               3-Hydroxybutyric acid                           ketone bodies
## 5                       Carbonic acid                             unspecified
## 6                         Oxalic acid                             unspecified
## 7                            Glycerol                             unspecified
## 8                             Leucine                         BCAA metabolism
## 9                          Isoleucine                         BCAA metabolism
## 10                    Phosphoric acid                             unspecified
## 11                      Glyceric acid                             unspecified
## 12                       Fumaric acid                               TCA cycle
## 13                         Malic acid                               TCA cycle
## 14                    Erythronic acid                 Carbohydrate metabolism
## 15                      Threonic acid                 Carbohydrate metabolism
## 16                         Methionine      Gly, Thr, Ser, Met, Cys metabolism
## 17                             Lyxose                 Carbohydrate metabolism
## 18                  Pyroglutamic acid Arg, Ala, Asp, Glu, Pro, Lys metabolism
## 19                 2-oxoglutaric acid Arg, Ala, Asp, Glu, Pro, Lys metabolism
## 20             1,5-Anhydro-D-glucitol                 Carbohydrate metabolism
## 21                           Fructose                 Carbohydrate metabolism
## 22                            Mannose                 Carbohydrate metabolism
## 23                        Citric acid                               TCA cycle
## 24                      Gluconic acid                 Carbohydrate metabolism
## 25                           Inositol                 Carbohydrate metabolism
## 26                  Hexadecanoic acid                             unspecified
## 27              2-Hydroxybutyric acid      Gly, Thr, Ser, Met, Cys metabolism
## 28             2-Hydroxyglutaric acid Arg, Ala, Asp, Glu, Pro, Lys metabolism
## 29  3-(4-Hydroxyphenyl)propionic acid                             unspecified
## 30               3-Aminobutanoic acid                             unspecified
## 31          3-Hydroxyanthranilic acid      Trp, Indole, Kynurenine metabolism
## 32                3-Hydroxykyunrenine      Trp, Indole, Kynurenine metabolism
## 33                  3-Methoxytyramine                     Phe, Tyr metabolism
## 34                    Acetylcarnitine                             unspecified
## 35                      Aconitic Acid                               TCA cycle
## 36                            Adenine                  Nucleotides metabolism
## 37                            Alanine Arg, Ala, Asp, Glu, Pro, Lys metabolism
## 38                      Phenylalanine                     Phe, Tyr metabolism
## 39            alpha-Ketoglutaric acid                               TCA cycle
## 40                   Anthranilic acid      Trp, Indole, Kynurenine metabolism
## 41                           Arginine Arg, Ala, Asp, Glu, Pro, Lys metabolism
## 42                         Asparagine Arg, Ala, Asp, Glu, Pro, Lys metabolism
## 43                      Aspartic acid Arg, Ala, Asp, Glu, Pro, Lys metabolism
## 44              N-Acetylaspartic acid Arg, Ala, Asp, Glu, Pro, Lys metabolism
## 45                             Valine                         BCAA metabolism
## 46                          Carnitine Arg, Ala, Asp, Glu, Pro, Lys metabolism
## 47                            Choline      Gly, Thr, Ser, Met, Cys metabolism
## 48                     Isocitric acid                               TCA cycle
## 49                         Citrulline Arg, Ala, Asp, Glu, Pro, Lys metabolism
## 50                           Creatine Arg, Ala, Asp, Glu, Pro, Lys metabolism
## 51                         Creatinine Arg, Ala, Asp, Glu, Pro, Lys metabolism
## 52                            Cystine      Gly, Thr, Ser, Met, Cys metabolism
## 53             Cytidine monophosphate                  Nucleotides metabolism
## 54                           Cytosine                  Nucleotides metabolism
## 55                           Dopamine                     Phe, Tyr metabolism
## 56                 D-PANTOTHENIC ACID                             unspecified
## 57            gamma-Aminobutyric acid Arg, Ala, Asp, Glu, Pro, Lys metabolism
## 58                            Glucose                 Carbohydrate metabolism
## 59                      Glutamic acid Arg, Ala, Asp, Glu, Pro, Lys metabolism
## 60                          Glutamine Arg, Ala, Asp, Glu, Pro, Lys metabolism
## 61       Glutathione, oxidized (GSSG)      Gly, Thr, Ser, Met, Cys metabolism
## 62         Glutathione, reduced (GSH)      Gly, Thr, Ser, Met, Cys metabolism
## 63                            Glycine      Gly, Thr, Ser, Met, Cys metabolism
## 64                  Isobutyrylglycine                             unspecified
## 65                               HIAA      Trp, Indole, Kynurenine metabolism
## 66                          Histidine                             unspecified
## 67                     Hydroxyproline Arg, Ala, Asp, Glu, Pro, Lys metabolism
## 68                             Indole      Trp, Indole, Kynurenine metabolism
## 69              Indole-3-acetaldehyde      Trp, Indole, Kynurenine metabolism
## 70                 Indole-3-acetamide      Trp, Indole, Kynurenine metabolism
## 71            Indole-3-carboxaldehyde      Trp, Indole, Kynurenine metabolism
## 72           Indole-3-carboxylic acid      Trp, Indole, Kynurenine metabolism
## 73                   Indole-3-ethanol      Trp, Indole, Kynurenine metabolism
## 74            Indole-3-glyoxylic acid      Trp, Indole, Kynurenine metabolism
## 75            Indole-3-propionic acid      Trp, Indole, Kynurenine metabolism
## 76                     Kynurenic acid      Trp, Indole, Kynurenine metabolism
## 77                         Kynurenine      Trp, Indole, Kynurenine metabolism
## 78                        Lactic Acid                 Carbohydrate metabolism
## 79                             Lysine Arg, Ala, Asp, Glu, Pro, Lys metabolism
## 80                 Methylmalonic acid                             unspecified
## 81                  N-Acetylserotonin      Trp, Indole, Kynurenine metabolism
## 82                             Niacin      Trp, Indole, Kynurenine metabolism
## 83                       Nicotinamide      Trp, Indole, Kynurenine metabolism
## 84                          Ornithine Arg, Ala, Asp, Glu, Pro, Lys metabolism
## 85                            Proline Arg, Ala, Asp, Glu, Pro, Lys metabolism
## 86                         Putrescine Arg, Ala, Asp, Glu, Pro, Lys metabolism
## 87                     Quinaldic acid      Trp, Indole, Kynurenine metabolism
## 88                    Quinolinic acid      Trp, Indole, Kynurenine metabolism
## 89                          Sarcosine      Gly, Thr, Ser, Met, Cys metabolism
## 90                             Serine      Gly, Thr, Ser, Met, Cys metabolism
## 91                          Serotonin      Trp, Indole, Kynurenine metabolism
## 92                      Succinic acid                               TCA cycle
## 93                            Taurine      Gly, Thr, Ser, Met, Cys metabolism
## 94                          Threonine      Gly, Thr, Ser, Met, Cys metabolism
## 95                            Thymine                  Nucleotides metabolism
## 96                         Tryptophan      Trp, Indole, Kynurenine metabolism
## 97                           Tyrosine                     Phe, Tyr metabolism
## 98                             Uracil                  Nucleotides metabolism
## 99                            Uridine                  Nucleotides metabolism
## 100                           Betaine      Gly, Thr, Ser, Met, Cys metabolism
dim(Metabolites) # 100 2
## [1] 100   2
filter(Metabolites, Metabolites %in% colnames(united.SPF3)) -> Metabolites
dim(Metabolites) # 81 2
## [1] 81  2
names(Metabolites)[2] <- "Pathways_group"
col_fun1 = colorRamp2(breaks =c(-2, -0.25,  0,  0.25, 2), colors=rev(brewer.pal(5, "RdYlBu")))
col <- colorRampPalette(bias =1, rev(brewer.pal(10, "RdYlBu")))(8)
color_map2 = list(Pathways = c("ketone bodies" = "red",
                              "Carbohydrate metabolism" = "blue", 
                              "Gly, Thr, Ser, Met, Cys metabolism" = "green", 
                              "Arg, Ala, Asp, Glu, Pro, Lys metabolism" = "darkgreen",
                              "BCAA metabolism" = "yellowgreen",
                              "Phe, Tyr metabolism" = "lightgreen",
                              "TCA cycle" = "yellow", 
                              "Nucleotides metabolism" = "purple", 
                              "Trp, Indole, Kynurenine metabolism" = "orange",  
                              "unspecified" = "grey"))
annot_df2 <- data.frame(Pathways = Metabolites$Pathways_group, stringsAsFactors = FALSE)
ha1 <- HeatmapAnnotation(df = annot_df2, name = colnames(annot_df2),
                         col = color_map2, border = TRUE, gap = unit(1, "cm"), simple_anno_size = unit(2, "cm"), which = "row")

Complex heatmap construction

set.seed(4321)
Heatmap(as.matrix(t(united.SPF3[, -1:-2])),
        name = "Relative changes", column_title = "Samples",
        column_title_gp = gpar(fontsize = 20), km = 2,
        row_title_gp = gpar(fontsize = 20),
        row_names_gp = gpar(fontsize = 6, fontface = 1, fontfamily = "sans"),
        row_title = "Metabolites", col = col_fun1, column_dend_height = unit(4, "cm"),
        row_dend_width = unit(3, "cm"),
        column_km = 2,
        column_names_gp = gpar(fontsize = 10, fontface = 1, fontfamily = "sans"),
        column_names_rot = 0,
        show_column_names = T,
        heatmap_height = unit(32, "cm"),
        top_annotation = ha,
        row_dend_reorder = TRUE,
        column_dend_reorder = TRUE,
        right_annotation = ha1) -> p
pdf("HCA_heatmap_mSPF_sel_2e.pdf",  width = 14, height = 14)
p
dev.off()
## quartz_off_screen 
##                 2
p