Load the necessary packages

library(tidyverse)
library(dplyr)
library(janitor)
library(FactoMineR)
library(factoextra)
library(pheatmap)
library(stringr)

Import SPS Data

SPS <- read.csv(
                "C:/Users/anifo/OneDrive/Desktop/Thomas/SPS.csv",
                fileEncoding = "latin1",
                check.names = FALSE,
                stringsAsFactors = FALSE
)

names(SPS)
## [1] "Peak No."    "Peak ID"     "Ret Time"    "Height"      "Area"       
## [6] "Conc µg/10g"
names(SPS)[names(SPS) == "Peak No."] <- "Peak_No"
names(SPS)[names(SPS) == "Peak ID"] <- "Peak_ID"
names(SPS)[names(SPS) == "Ret Time"] <- "Ret_Time"
names(SPS)[names(SPS) == "Conc µg/10g"] <- "Conc_ug_10g"

SPS <- SPS[rowSums(is.na(SPS)) != ncol(SPS), ]
names(SPS)
## [1] "Peak_No"     "Peak_ID"     "Ret_Time"    "Height"      "Area"       
## [6] "Conc_ug_10g"
glimpse(SPS)
## Rows: 10
## Columns: 6
## $ Peak_No     <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 11
## $ Peak_ID     <chr> "Alanine ", "Glycine ", "Valine", "Isoleucine", "Threonine…
## $ Ret_Time    <dbl> 1.657, 2.765, 3.132, 3.332, 3.865, 4.698, 5.232, 5.465, 6.…
## $ Height      <dbl> 814.821, 196.091, 399.000, 17278.215, 4804.503, 1984.461, …
## $ Area        <dbl> 34166.250, 5976.300, 6935.500, 1101155.625, 146540.531, 64…
## $ Conc_ug_10g <dbl> 1.0107, 8.8758, 11.9590, 0.7854, 6.3660, 219.8710, 30.0620…
print(SPS)
##    Peak_No         Peak_ID Ret_Time    Height        Area Conc_ug_10g
## 1        1        Alanine     1.657   814.821   34166.250      1.0107
## 2        2        Glycine     2.765   196.091    5976.300      8.8758
## 3        3          Valine    3.132   399.000    6935.500     11.9590
## 4        4      Isoleucine    3.332 17278.215 1101155.625      0.7854
## 5        5       Threonine    3.865  4804.503  146540.531      6.3660
## 6        6   Glutamic acid    4.698  1984.461   64636.328    219.8710
## 7        7      Methionine    5.232  1735.419   82116.672     30.0620
## 8        8 Hydroxylproline    5.465  1330.485   36460.293     12.5600
## 9        9        Cysteine    6.032   305.778    7825.513     13.0390
## 10      11      Ornithine     7.265   871.100   28979.699      2.0970

Import JPS Data

JPS <- read.csv(
                "C:/Users/anifo/OneDrive/Desktop/Thomas/JPS.csv",
                fileEncoding = "latin1",
                check.names = FALSE,
                stringsAsFactors = FALSE
)
names(JPS)
## [1] "Peak No."    "Peak ID"     "Ret Time"    "Height"      "Area"       
## [6] "Conc µg/10g"
names(JPS)[names(JPS) == "Peak No."] <- "Peak_No"
names(JPS)[names(JPS) == "Peak ID"] <- "Peak_ID"
names(JPS)[names(JPS) == "Ret Time"] <- "Ret_Time"
names(JPS)[names(JPS) == "Conc µg/10g"] <- "Conc_ug_10g"

JPS <- JPS[rowSums(is.na(JPS)) != ncol(JPS), ]
glimpse(JPS)
## Rows: 13
## Columns: 6
## $ Peak_No     <int> 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13
## $ Peak_ID     <chr> "Unidentified_1", "Alanine ", "Glycine ", "Leucine", "Hydr…
## $ Ret_Time    <dbl> 0.173, 1.190, 1.565, 2.015, 3.557, 3.865, 4.215, 4.698, 5.…
## $ Height      <dbl> 3190.00, 2353.62, 285391.88, 867249.94, 250170.44, 468698.…
## $ Area        <dbl> 38402.6, 8479.0, 3862585.5, 21675082.0, 6458566.5, 7591787…
## $ Conc_ug_10g <dbl> 0.0748, 0.0165, 7.5202, 42.1999, 12.5744, 14.7807, 7.1902,…
print(JPS)
##    Peak_No         Peak_ID Ret_Time    Height       Area Conc_ug_10g
## 1        1  Unidentified_1    0.173   3190.00    38402.6      0.0748
## 2        1        Alanine     1.190   2353.62     8479.0      0.0165
## 3        2        Glycine     1.565 285391.88  3862585.5      7.5202
## 4        3         Leucine    2.015 867249.94 21675082.0     42.1999
## 5        4 Hydroxylproline    3.557 250170.44  6458566.5     12.5744
## 6        5   Phenylalanine    3.865 468698.12  7591787.5     14.7807
## 7        6          Lysine    4.215 211793.64  3693104.8      7.1902
## 8        7       Histidine    4.698 157652.59  2497412.2      4.8623
## 9        8        Cysteine    5.048  78591.11  2117121.0      4.1219
## 10       9   Glutamic acid    5.798  62464.93  1073757.1      2.0905
## 11      11        Tyrosine    5.982 125642.16  1709865.9      3.3290
## 12      12      Tryptophan    6.548  41483.75   521804.7      1.0159
## 13      13  Unidentified_2    6.998  10148.85   114876.2      0.2237

Import AJS Data

APS <- read.csv(
                "C:/Users/anifo/OneDrive/Desktop/Thomas/APS.csv",
                fileEncoding = "latin1",
                check.names = FALSE,
                stringsAsFactors = FALSE
)
names(APS)
## [1] "Peak No."    "Peak ID"     "Ret Time"    "Height"      "Area"       
## [6] "Conc µg/10g"
names(APS)[names(APS) == "Peak No."] <- "Peak_No"
names(APS)[names(APS) == "Peak ID"] <- "Peak_ID"
names(APS)[names(APS) == "Ret Time"] <- "Ret_Time"
names(APS)[names(APS) == "Conc µg/10g"] <- "Conc_ug_10g"

APS <- APS[rowSums(is.na(APS)) != ncol(APS), ]
print(APS)
##   Peak_No         Peak_ID Ret_Time     Height       Area Conc_ug_10g
## 1       1  Unidentified_1    0.523   109.9470    505.992      0.0372
## 2       2        Alanine     1.523   831.8250  51059.988      3.8168
## 3       3         Serine     2.407  1159.2340  67851.953      5.0728
## 4       4         Lysine     4.157    92.8062   3091.650      0.2316
## 5       5      Ornithine     5.790   366.0000  12153.000      0.9086
## 6       6       Tyrosine     5.973 19871.8980 995638.125    174.4290
## 7       7     Tryptophan     6.890  3002.6240  88647.711      6.6253
## 8       8 Unidentified_2     8.032  1236.5600  63113.801     14.7174
## 9       9  Unidentified_3   11.098  1305.0390  55702.699      4.1633

Import COPS Data

COPS <- read.csv(
                "C:/Users/anifo/OneDrive/Desktop/Thomas/COPS.csv",
                fileEncoding = "latin1",
                check.names = FALSE,
                stringsAsFactors = FALSE
)
names(COPS)
## [1] "Peak No."    "Peak ID"     "Ret Time"    "Height"      "Area"       
## [6] "Conc µg/10g"
names(COPS)[names(COPS) == "Peak No."] <- "Peak_No"
names(COPS)[names(COPS) == "Peak ID"] <- "Peak_ID"
names(COPS)[names(COPS) == "Ret Time"] <- "Ret_Time"
names(COPS)[names(COPS) == "Conc µg/10g"] <- "Conc_ug_10g"

COPS <- COPS[rowSums(is.na(APS)) != ncol(COPS), ]
print(COPS)
##    Peak_No        Peak_ID Ret_Time    Height       Area Conc_ug_10g
## 1        1       Alanine     1.557  2153.328  42436.000      5.8902
## 2        2       Glycine     1.940  2962.508  40132.484      8.1036
## 3        3        Leucine    2.323 23398.689 605457.563     64.0041
## 4        4     Isoleucine    2.848  7967.754 210798.063     21.7948
## 5        5     Methionine    3.865    30.230   3420.905      0.7575
## 6        6 Unidentified_1    3.482    46.000    244.700      0.1765
## 7        7      Glutamine    3.948    20.846    208.900      0.0455
## 8        8         Lysine    4.215     6.818    204.400      0.0556
## 9        9      Histidine    4.748    52.333    582.653      0.0050
## 10      10 Unidentified_2    5.232    67.196   3152.649      0.0240
## 11      11 Unidentified_3    5.732    13.750    138.300      0.0036

Add all the data together

SPS$Group  <- "SPS"
JPS$Group  <- "JPS"
APS$Group  <- "APS"
COPS$Group <- "COPS"

all_data <- bind_rows(SPS, JPS, APS, COPS)

print(all_data)
##    Peak_No         Peak_ID Ret_Time      Height         Area Conc_ug_10g Group
## 1        1        Alanine     1.657    814.8210    34166.250      1.0107   SPS
## 2        2        Glycine     2.765    196.0910     5976.300      8.8758   SPS
## 3        3          Valine    3.132    399.0000     6935.500     11.9590   SPS
## 4        4      Isoleucine    3.332  17278.2150  1101155.625      0.7854   SPS
## 5        5       Threonine    3.865   4804.5030   146540.531      6.3660   SPS
## 6        6   Glutamic acid    4.698   1984.4610    64636.328    219.8710   SPS
## 7        7      Methionine    5.232   1735.4190    82116.672     30.0620   SPS
## 8        8 Hydroxylproline    5.465   1330.4850    36460.293     12.5600   SPS
## 9        9        Cysteine    6.032    305.7780     7825.513     13.0390   SPS
## 10      11      Ornithine     7.265    871.1000    28979.699      2.0970   SPS
## 11       1  Unidentified_1    0.173   3190.0000    38402.602      0.0748   JPS
## 12       1        Alanine     1.190   2353.6200     8479.000      0.0165   JPS
## 13       2        Glycine     1.565 285391.8750  3862585.500      7.5202   JPS
## 14       3         Leucine    2.015 867249.9380 21675082.000     42.1999   JPS
## 15       4 Hydroxylproline    3.557 250170.4380  6458566.500     12.5744   JPS
## 16       5   Phenylalanine    3.865 468698.1250  7591787.500     14.7807   JPS
## 17       6          Lysine    4.215 211793.6410  3693104.750      7.1902   JPS
## 18       7       Histidine    4.698 157652.5940  2497412.250      4.8623   JPS
## 19       8        Cysteine    5.048  78591.1090  2117121.000      4.1219   JPS
## 20       9   Glutamic acid    5.798  62464.9340  1073757.125      2.0905   JPS
## 21      11        Tyrosine    5.982 125642.1560  1709865.875      3.3290   JPS
## 22      12      Tryptophan    6.548  41483.7540   521804.656      1.0159   JPS
## 23      13  Unidentified_2    6.998  10148.8480   114876.164      0.2237   JPS
## 24       1  Unidentified_1    0.523    109.9470      505.992      0.0372   APS
## 25       2        Alanine     1.523    831.8250    51059.988      3.8168   APS
## 26       3         Serine     2.407   1159.2340    67851.953      5.0728   APS
## 27       4         Lysine     4.157     92.8062     3091.650      0.2316   APS
## 28       5      Ornithine     5.790    366.0000    12153.000      0.9086   APS
## 29       6       Tyrosine     5.973  19871.8980   995638.125    174.4290   APS
## 30       7     Tryptophan     6.890   3002.6240    88647.711      6.6253   APS
## 31       8 Unidentified_2     8.032   1236.5600    63113.801     14.7174   APS
## 32       9  Unidentified_3   11.098   1305.0390    55702.699      4.1633   APS
## 33       1        Alanine     1.557   2153.3280    42436.000      5.8902  COPS
## 34       2        Glycine     1.940   2962.5080    40132.484      8.1036  COPS
## 35       3         Leucine    2.323  23398.6890   605457.563     64.0041  COPS
## 36       4      Isoleucine    2.848   7967.7540   210798.063     21.7948  COPS
## 37       5      Methionine    3.865     30.2300     3420.905      0.7575  COPS
## 38       6  Unidentified_1    3.482     46.0000      244.700      0.1765  COPS
## 39       7       Glutamine    3.948     20.8460      208.900      0.0455  COPS
## 40       8          Lysine    4.215      6.8180      204.400      0.0556  COPS
## 41       9       Histidine    4.748     52.3330      582.653      0.0050  COPS
## 42      10  Unidentified_2    5.232     67.1960     3152.649      0.0240  COPS
## 43      11  Unidentified_3    5.732     13.7500      138.300      0.0036  COPS

Convert from pivot to wider data

all_data_clean <- all_data %>%
                mutate(
                                Peak_ID = str_trim(Peak_ID), # remove leading/trailing spaces
                                Peak_ID = str_squish(Peak_ID),     # remove double spaces
                                Peak_ID = iconv(Peak_ID, to = "UTF-8") # normalize encoding
                ) %>%
                group_by(Group, Peak_ID) %>%
                summarise(Conc_ug_10g = sum(Conc_ug_10g), .groups = "drop")

wide_data <- all_data_clean %>% 
                tidyr::pivot_wider(names_from = Peak_ID, 
                                   values_from = Conc_ug_10g, values_fill = 0 )
names(wide_data)
##  [1] "Group"           "Alanine"         "Lysine"          "Ornithine"      
##  [5] "Serine"          "Tryptophan"      "Tyrosine"        "Unidentified_1" 
##  [9] "Unidentified_2"  "Unidentified_3"  "Glutamine"       "Glycine"        
## [13] "Histidine"       "Isoleucine"      "Leucine"         "Methionine"     
## [17] "Cysteine"        "Glutamic acid"   "Hydroxylproline" "Phenylalanine"  
## [21] "Threonine"       "Valine"

Replace NA with 0

wide_data[is.na(wide_data)] <- 0
print(wide_data)
## # A tibble: 4 × 22
##   Group Alanine Lysine Ornithine Serine Tryptophan Tyrosine Unidentified_1
##   <chr>   <dbl>  <dbl>     <dbl>  <dbl>      <dbl>    <dbl>          <dbl>
## 1 APS    3.82   0.232      0.909   5.07       6.63   174.           0.0372
## 2 COPS   5.89   0.0556     0       0          0        0            0.176 
## 3 JPS    0.0165 7.19       0       0          1.02     3.33         0.0748
## 4 SPS    1.01   0          2.10    0          0        0            0     
## # ℹ 14 more variables: Unidentified_2 <dbl>, Unidentified_3 <dbl>,
## #   Glutamine <dbl>, Glycine <dbl>, Histidine <dbl>, Isoleucine <dbl>,
## #   Leucine <dbl>, Methionine <dbl>, Cysteine <dbl>, `Glutamic acid` <dbl>,
## #   Hydroxylproline <dbl>, Phenylalanine <dbl>, Threonine <dbl>, Valine <dbl>

Descriptive:

#total conc per group
all_data %>%
                group_by(Group) %>%
                summarise(total_conc = sum(Conc_ug_10g))
## # A tibble: 4 × 2
##   Group total_conc
##   <chr>      <dbl>
## 1 APS         210.
## 2 COPS        101.
## 3 JPS         100 
## 4 SPS         307.
#total compounds per group
all_data %>%
                group_by(Group, Peak_ID) %>%
                summarise(mean_conc = mean(Conc_ug_10g)) %>%
                arrange(Group, desc(mean_conc))
## # A tibble: 43 × 3
## # Groups:   Group [4]
##    Group Peak_ID           mean_conc
##    <chr> <chr>                 <dbl>
##  1 APS   "Tyrosine "        174.    
##  2 APS   "Unidentified_2 "   14.7   
##  3 APS   "Tryptophan "        6.63  
##  4 APS   "Serine "            5.07  
##  5 APS   "Unidentified_3"     4.16  
##  6 APS   "Alanine "           3.82  
##  7 APS   "Ornithine "         0.909 
##  8 APS   "Lysine "            0.232 
##  9 APS   "Unidentified_1"     0.0372
## 10 COPS  "Leucine"           64.0   
## # ℹ 33 more rows

Barplot

ggplot(all_data, aes(x = Peak_ID, y = Conc_ug_10g, fill = Group)) +
                geom_col(position = "dodge") +
                coord_flip() +
                theme_minimal()

PCA analysis

names(wide_data)
##  [1] "Group"           "Alanine"         "Lysine"          "Ornithine"      
##  [5] "Serine"          "Tryptophan"      "Tyrosine"        "Unidentified_1" 
##  [9] "Unidentified_2"  "Unidentified_3"  "Glutamine"       "Glycine"        
## [13] "Histidine"       "Isoleucine"      "Leucine"         "Methionine"     
## [17] "Cysteine"        "Glutamic acid"   "Hydroxylproline" "Phenylalanine"  
## [21] "Threonine"       "Valine"
#clean the data
# Ensure Group column exists and is lowercase
if ("Group" %in% names(all_data_clean)) {
  all_data_clean <- all_data_clean %>% rename(group = Group)
}

# Pivot to wide format
wide_data <- all_data_clean %>%
  pivot_wider(
    names_from = Peak_ID,
    values_from = Conc_ug_10g,
    values_fill = 0
  ) %>%
  clean_names()

# Move group to rownames if present
if ("group" %in% names(wide_data)) {
  wide_data <- wide_data %>% column_to_rownames("group")
}

groups <- factor(rownames(wide_data))
pca_res <- PCA(wide_data, graph = FALSE)

fviz_pca_ind(
                pca_res,
                habillage = groups,
                geom = "point"
)

#This tells you why APS, COPS, JPS, and SPS separate in PCA space.
fviz_pca_var(pca_res)

Heatmap

#Heatmap: Visualise patterns across all amino acids
pheatmap( wide_data, 
          cluster_rows = TRUE, 
          cluster_cols = TRUE, scale = "row" )

Statistics

#Group-wise summaries (means, totals, profiles)
all_data %>%
                group_by(Group, Peak_ID) %>%
                summarise(mean_conc = mean(Conc_ug_10g)) %>%
                arrange(Group, desc(mean_conc))
## # A tibble: 43 × 3
## # Groups:   Group [4]
##    Group Peak_ID           mean_conc
##    <chr> <chr>                 <dbl>
##  1 APS   "Tyrosine "        174.    
##  2 APS   "Unidentified_2 "   14.7   
##  3 APS   "Tryptophan "        6.63  
##  4 APS   "Serine "            5.07  
##  5 APS   "Unidentified_3"     4.16  
##  6 APS   "Alanine "           3.82  
##  7 APS   "Ornithine "         0.909 
##  8 APS   "Lysine "            0.232 
##  9 APS   "Unidentified_1"     0.0372
## 10 COPS  "Leucine"           64.0   
## # ℹ 33 more rows
#HCA- Hierachical clustering
dist_matrix <- dist(wide_data)
hc <- hclust(dist_matrix)
plot(hc)

## Correlation analysis: to explore relationships between compounds
cor_matrix <- cor(wide_data)
pheatmap(cor_matrix)