Load the necessary packages
library(tidyverse)
library(dplyr)
library(janitor)
library(FactoMineR)
library(factoextra)
library(pheatmap)
library(stringr)
Import SPS Data
SPS <- read.csv(
"C:/Users/anifo/OneDrive/Desktop/Thomas/SPS.csv",
fileEncoding = "latin1",
check.names = FALSE,
stringsAsFactors = FALSE
)
names(SPS)
## [1] "Peak No." "Peak ID" "Ret Time" "Height" "Area"
## [6] "Conc µg/10g"
names(SPS)[names(SPS) == "Peak No."] <- "Peak_No"
names(SPS)[names(SPS) == "Peak ID"] <- "Peak_ID"
names(SPS)[names(SPS) == "Ret Time"] <- "Ret_Time"
names(SPS)[names(SPS) == "Conc µg/10g"] <- "Conc_ug_10g"
SPS <- SPS[rowSums(is.na(SPS)) != ncol(SPS), ]
names(SPS)
## [1] "Peak_No" "Peak_ID" "Ret_Time" "Height" "Area"
## [6] "Conc_ug_10g"
glimpse(SPS)
## Rows: 10
## Columns: 6
## $ Peak_No <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 11
## $ Peak_ID <chr> "Alanine ", "Glycine ", "Valine", "Isoleucine", "Threonine…
## $ Ret_Time <dbl> 1.657, 2.765, 3.132, 3.332, 3.865, 4.698, 5.232, 5.465, 6.…
## $ Height <dbl> 814.821, 196.091, 399.000, 17278.215, 4804.503, 1984.461, …
## $ Area <dbl> 34166.250, 5976.300, 6935.500, 1101155.625, 146540.531, 64…
## $ Conc_ug_10g <dbl> 1.0107, 8.8758, 11.9590, 0.7854, 6.3660, 219.8710, 30.0620…
print(SPS)
## Peak_No Peak_ID Ret_Time Height Area Conc_ug_10g
## 1 1 Alanine 1.657 814.821 34166.250 1.0107
## 2 2 Glycine 2.765 196.091 5976.300 8.8758
## 3 3 Valine 3.132 399.000 6935.500 11.9590
## 4 4 Isoleucine 3.332 17278.215 1101155.625 0.7854
## 5 5 Threonine 3.865 4804.503 146540.531 6.3660
## 6 6 Glutamic acid 4.698 1984.461 64636.328 219.8710
## 7 7 Methionine 5.232 1735.419 82116.672 30.0620
## 8 8 Hydroxylproline 5.465 1330.485 36460.293 12.5600
## 9 9 Cysteine 6.032 305.778 7825.513 13.0390
## 10 11 Ornithine 7.265 871.100 28979.699 2.0970
Import JPS Data
JPS <- read.csv(
"C:/Users/anifo/OneDrive/Desktop/Thomas/JPS.csv",
fileEncoding = "latin1",
check.names = FALSE,
stringsAsFactors = FALSE
)
names(JPS)
## [1] "Peak No." "Peak ID" "Ret Time" "Height" "Area"
## [6] "Conc µg/10g"
names(JPS)[names(JPS) == "Peak No."] <- "Peak_No"
names(JPS)[names(JPS) == "Peak ID"] <- "Peak_ID"
names(JPS)[names(JPS) == "Ret Time"] <- "Ret_Time"
names(JPS)[names(JPS) == "Conc µg/10g"] <- "Conc_ug_10g"
JPS <- JPS[rowSums(is.na(JPS)) != ncol(JPS), ]
glimpse(JPS)
## Rows: 13
## Columns: 6
## $ Peak_No <int> 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13
## $ Peak_ID <chr> "Unidentified_1", "Alanine ", "Glycine ", "Leucine", "Hydr…
## $ Ret_Time <dbl> 0.173, 1.190, 1.565, 2.015, 3.557, 3.865, 4.215, 4.698, 5.…
## $ Height <dbl> 3190.00, 2353.62, 285391.88, 867249.94, 250170.44, 468698.…
## $ Area <dbl> 38402.6, 8479.0, 3862585.5, 21675082.0, 6458566.5, 7591787…
## $ Conc_ug_10g <dbl> 0.0748, 0.0165, 7.5202, 42.1999, 12.5744, 14.7807, 7.1902,…
print(JPS)
## Peak_No Peak_ID Ret_Time Height Area Conc_ug_10g
## 1 1 Unidentified_1 0.173 3190.00 38402.6 0.0748
## 2 1 Alanine 1.190 2353.62 8479.0 0.0165
## 3 2 Glycine 1.565 285391.88 3862585.5 7.5202
## 4 3 Leucine 2.015 867249.94 21675082.0 42.1999
## 5 4 Hydroxylproline 3.557 250170.44 6458566.5 12.5744
## 6 5 Phenylalanine 3.865 468698.12 7591787.5 14.7807
## 7 6 Lysine 4.215 211793.64 3693104.8 7.1902
## 8 7 Histidine 4.698 157652.59 2497412.2 4.8623
## 9 8 Cysteine 5.048 78591.11 2117121.0 4.1219
## 10 9 Glutamic acid 5.798 62464.93 1073757.1 2.0905
## 11 11 Tyrosine 5.982 125642.16 1709865.9 3.3290
## 12 12 Tryptophan 6.548 41483.75 521804.7 1.0159
## 13 13 Unidentified_2 6.998 10148.85 114876.2 0.2237
Import AJS Data
APS <- read.csv(
"C:/Users/anifo/OneDrive/Desktop/Thomas/APS.csv",
fileEncoding = "latin1",
check.names = FALSE,
stringsAsFactors = FALSE
)
names(APS)
## [1] "Peak No." "Peak ID" "Ret Time" "Height" "Area"
## [6] "Conc µg/10g"
names(APS)[names(APS) == "Peak No."] <- "Peak_No"
names(APS)[names(APS) == "Peak ID"] <- "Peak_ID"
names(APS)[names(APS) == "Ret Time"] <- "Ret_Time"
names(APS)[names(APS) == "Conc µg/10g"] <- "Conc_ug_10g"
APS <- APS[rowSums(is.na(APS)) != ncol(APS), ]
print(APS)
## Peak_No Peak_ID Ret_Time Height Area Conc_ug_10g
## 1 1 Unidentified_1 0.523 109.9470 505.992 0.0372
## 2 2 Alanine 1.523 831.8250 51059.988 3.8168
## 3 3 Serine 2.407 1159.2340 67851.953 5.0728
## 4 4 Lysine 4.157 92.8062 3091.650 0.2316
## 5 5 Ornithine 5.790 366.0000 12153.000 0.9086
## 6 6 Tyrosine 5.973 19871.8980 995638.125 174.4290
## 7 7 Tryptophan 6.890 3002.6240 88647.711 6.6253
## 8 8 Unidentified_2 8.032 1236.5600 63113.801 14.7174
## 9 9 Unidentified_3 11.098 1305.0390 55702.699 4.1633
Import COPS Data
COPS <- read.csv(
"C:/Users/anifo/OneDrive/Desktop/Thomas/COPS.csv",
fileEncoding = "latin1",
check.names = FALSE,
stringsAsFactors = FALSE
)
names(COPS)
## [1] "Peak No." "Peak ID" "Ret Time" "Height" "Area"
## [6] "Conc µg/10g"
names(COPS)[names(COPS) == "Peak No."] <- "Peak_No"
names(COPS)[names(COPS) == "Peak ID"] <- "Peak_ID"
names(COPS)[names(COPS) == "Ret Time"] <- "Ret_Time"
names(COPS)[names(COPS) == "Conc µg/10g"] <- "Conc_ug_10g"
COPS <- COPS[rowSums(is.na(APS)) != ncol(COPS), ]
print(COPS)
## Peak_No Peak_ID Ret_Time Height Area Conc_ug_10g
## 1 1 Alanine 1.557 2153.328 42436.000 5.8902
## 2 2 Glycine 1.940 2962.508 40132.484 8.1036
## 3 3 Leucine 2.323 23398.689 605457.563 64.0041
## 4 4 Isoleucine 2.848 7967.754 210798.063 21.7948
## 5 5 Methionine 3.865 30.230 3420.905 0.7575
## 6 6 Unidentified_1 3.482 46.000 244.700 0.1765
## 7 7 Glutamine 3.948 20.846 208.900 0.0455
## 8 8 Lysine 4.215 6.818 204.400 0.0556
## 9 9 Histidine 4.748 52.333 582.653 0.0050
## 10 10 Unidentified_2 5.232 67.196 3152.649 0.0240
## 11 11 Unidentified_3 5.732 13.750 138.300 0.0036
Add all the data together
SPS$Group <- "SPS"
JPS$Group <- "JPS"
APS$Group <- "APS"
COPS$Group <- "COPS"
all_data <- bind_rows(SPS, JPS, APS, COPS)
print(all_data)
## Peak_No Peak_ID Ret_Time Height Area Conc_ug_10g Group
## 1 1 Alanine 1.657 814.8210 34166.250 1.0107 SPS
## 2 2 Glycine 2.765 196.0910 5976.300 8.8758 SPS
## 3 3 Valine 3.132 399.0000 6935.500 11.9590 SPS
## 4 4 Isoleucine 3.332 17278.2150 1101155.625 0.7854 SPS
## 5 5 Threonine 3.865 4804.5030 146540.531 6.3660 SPS
## 6 6 Glutamic acid 4.698 1984.4610 64636.328 219.8710 SPS
## 7 7 Methionine 5.232 1735.4190 82116.672 30.0620 SPS
## 8 8 Hydroxylproline 5.465 1330.4850 36460.293 12.5600 SPS
## 9 9 Cysteine 6.032 305.7780 7825.513 13.0390 SPS
## 10 11 Ornithine 7.265 871.1000 28979.699 2.0970 SPS
## 11 1 Unidentified_1 0.173 3190.0000 38402.602 0.0748 JPS
## 12 1 Alanine 1.190 2353.6200 8479.000 0.0165 JPS
## 13 2 Glycine 1.565 285391.8750 3862585.500 7.5202 JPS
## 14 3 Leucine 2.015 867249.9380 21675082.000 42.1999 JPS
## 15 4 Hydroxylproline 3.557 250170.4380 6458566.500 12.5744 JPS
## 16 5 Phenylalanine 3.865 468698.1250 7591787.500 14.7807 JPS
## 17 6 Lysine 4.215 211793.6410 3693104.750 7.1902 JPS
## 18 7 Histidine 4.698 157652.5940 2497412.250 4.8623 JPS
## 19 8 Cysteine 5.048 78591.1090 2117121.000 4.1219 JPS
## 20 9 Glutamic acid 5.798 62464.9340 1073757.125 2.0905 JPS
## 21 11 Tyrosine 5.982 125642.1560 1709865.875 3.3290 JPS
## 22 12 Tryptophan 6.548 41483.7540 521804.656 1.0159 JPS
## 23 13 Unidentified_2 6.998 10148.8480 114876.164 0.2237 JPS
## 24 1 Unidentified_1 0.523 109.9470 505.992 0.0372 APS
## 25 2 Alanine 1.523 831.8250 51059.988 3.8168 APS
## 26 3 Serine 2.407 1159.2340 67851.953 5.0728 APS
## 27 4 Lysine 4.157 92.8062 3091.650 0.2316 APS
## 28 5 Ornithine 5.790 366.0000 12153.000 0.9086 APS
## 29 6 Tyrosine 5.973 19871.8980 995638.125 174.4290 APS
## 30 7 Tryptophan 6.890 3002.6240 88647.711 6.6253 APS
## 31 8 Unidentified_2 8.032 1236.5600 63113.801 14.7174 APS
## 32 9 Unidentified_3 11.098 1305.0390 55702.699 4.1633 APS
## 33 1 Alanine 1.557 2153.3280 42436.000 5.8902 COPS
## 34 2 Glycine 1.940 2962.5080 40132.484 8.1036 COPS
## 35 3 Leucine 2.323 23398.6890 605457.563 64.0041 COPS
## 36 4 Isoleucine 2.848 7967.7540 210798.063 21.7948 COPS
## 37 5 Methionine 3.865 30.2300 3420.905 0.7575 COPS
## 38 6 Unidentified_1 3.482 46.0000 244.700 0.1765 COPS
## 39 7 Glutamine 3.948 20.8460 208.900 0.0455 COPS
## 40 8 Lysine 4.215 6.8180 204.400 0.0556 COPS
## 41 9 Histidine 4.748 52.3330 582.653 0.0050 COPS
## 42 10 Unidentified_2 5.232 67.1960 3152.649 0.0240 COPS
## 43 11 Unidentified_3 5.732 13.7500 138.300 0.0036 COPS
Convert from pivot to wider data
all_data_clean <- all_data %>%
mutate(
Peak_ID = str_trim(Peak_ID), # remove leading/trailing spaces
Peak_ID = str_squish(Peak_ID), # remove double spaces
Peak_ID = iconv(Peak_ID, to = "UTF-8") # normalize encoding
) %>%
group_by(Group, Peak_ID) %>%
summarise(Conc_ug_10g = sum(Conc_ug_10g), .groups = "drop")
wide_data <- all_data_clean %>%
tidyr::pivot_wider(names_from = Peak_ID,
values_from = Conc_ug_10g, values_fill = 0 )
names(wide_data)
## [1] "Group" "Alanine" "Lysine" "Ornithine"
## [5] "Serine" "Tryptophan" "Tyrosine" "Unidentified_1"
## [9] "Unidentified_2" "Unidentified_3" "Glutamine" "Glycine"
## [13] "Histidine" "Isoleucine" "Leucine" "Methionine"
## [17] "Cysteine" "Glutamic acid" "Hydroxylproline" "Phenylalanine"
## [21] "Threonine" "Valine"
Replace NA with 0
wide_data[is.na(wide_data)] <- 0
print(wide_data)
## # A tibble: 4 × 22
## Group Alanine Lysine Ornithine Serine Tryptophan Tyrosine Unidentified_1
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 APS 3.82 0.232 0.909 5.07 6.63 174. 0.0372
## 2 COPS 5.89 0.0556 0 0 0 0 0.176
## 3 JPS 0.0165 7.19 0 0 1.02 3.33 0.0748
## 4 SPS 1.01 0 2.10 0 0 0 0
## # ℹ 14 more variables: Unidentified_2 <dbl>, Unidentified_3 <dbl>,
## # Glutamine <dbl>, Glycine <dbl>, Histidine <dbl>, Isoleucine <dbl>,
## # Leucine <dbl>, Methionine <dbl>, Cysteine <dbl>, `Glutamic acid` <dbl>,
## # Hydroxylproline <dbl>, Phenylalanine <dbl>, Threonine <dbl>, Valine <dbl>
Descriptive:
#total conc per group
all_data %>%
group_by(Group) %>%
summarise(total_conc = sum(Conc_ug_10g))
## # A tibble: 4 × 2
## Group total_conc
## <chr> <dbl>
## 1 APS 210.
## 2 COPS 101.
## 3 JPS 100
## 4 SPS 307.
#total compounds per group
all_data %>%
group_by(Group, Peak_ID) %>%
summarise(mean_conc = mean(Conc_ug_10g)) %>%
arrange(Group, desc(mean_conc))
## # A tibble: 43 × 3
## # Groups: Group [4]
## Group Peak_ID mean_conc
## <chr> <chr> <dbl>
## 1 APS "Tyrosine " 174.
## 2 APS "Unidentified_2 " 14.7
## 3 APS "Tryptophan " 6.63
## 4 APS "Serine " 5.07
## 5 APS "Unidentified_3" 4.16
## 6 APS "Alanine " 3.82
## 7 APS "Ornithine " 0.909
## 8 APS "Lysine " 0.232
## 9 APS "Unidentified_1" 0.0372
## 10 COPS "Leucine" 64.0
## # ℹ 33 more rows
Barplot
ggplot(all_data, aes(x = Peak_ID, y = Conc_ug_10g, fill = Group)) +
geom_col(position = "dodge") +
coord_flip() +
theme_minimal()

PCA analysis
names(wide_data)
## [1] "Group" "Alanine" "Lysine" "Ornithine"
## [5] "Serine" "Tryptophan" "Tyrosine" "Unidentified_1"
## [9] "Unidentified_2" "Unidentified_3" "Glutamine" "Glycine"
## [13] "Histidine" "Isoleucine" "Leucine" "Methionine"
## [17] "Cysteine" "Glutamic acid" "Hydroxylproline" "Phenylalanine"
## [21] "Threonine" "Valine"
#clean the data
# Ensure Group column exists and is lowercase
if ("Group" %in% names(all_data_clean)) {
all_data_clean <- all_data_clean %>% rename(group = Group)
}
# Pivot to wide format
wide_data <- all_data_clean %>%
pivot_wider(
names_from = Peak_ID,
values_from = Conc_ug_10g,
values_fill = 0
) %>%
clean_names()
# Move group to rownames if present
if ("group" %in% names(wide_data)) {
wide_data <- wide_data %>% column_to_rownames("group")
}
groups <- factor(rownames(wide_data))
pca_res <- PCA(wide_data, graph = FALSE)
fviz_pca_ind(
pca_res,
habillage = groups,
geom = "point"
)

#This tells you why APS, COPS, JPS, and SPS separate in PCA space.
fviz_pca_var(pca_res)

Heatmap
#Heatmap: Visualise patterns across all amino acids
pheatmap( wide_data,
cluster_rows = TRUE,
cluster_cols = TRUE, scale = "row" )

Statistics
#Group-wise summaries (means, totals, profiles)
all_data %>%
group_by(Group, Peak_ID) %>%
summarise(mean_conc = mean(Conc_ug_10g)) %>%
arrange(Group, desc(mean_conc))
## # A tibble: 43 × 3
## # Groups: Group [4]
## Group Peak_ID mean_conc
## <chr> <chr> <dbl>
## 1 APS "Tyrosine " 174.
## 2 APS "Unidentified_2 " 14.7
## 3 APS "Tryptophan " 6.63
## 4 APS "Serine " 5.07
## 5 APS "Unidentified_3" 4.16
## 6 APS "Alanine " 3.82
## 7 APS "Ornithine " 0.909
## 8 APS "Lysine " 0.232
## 9 APS "Unidentified_1" 0.0372
## 10 COPS "Leucine" 64.0
## # ℹ 33 more rows
#HCA- Hierachical clustering
dist_matrix <- dist(wide_data)
hc <- hclust(dist_matrix)
plot(hc)

## Correlation analysis: to explore relationships between compounds
cor_matrix <- cor(wide_data)
pheatmap(cor_matrix)
