# NEEDED PACKAGES:

suppressPackageStartupMessages({

library(ggplot2)
library(car)
library(dplyr)
library(tidyverse)
library(dendextend)
library(pheatmap)
})
# 1. Prepare the data set

set.seed(123)

cytokine_df <- data.frame(
  Group = rep(c("Control", "Sepsis"), each = 30),
  IL6 = c(rnorm(30, 5, 1), rnorm(30, 15, 5)),
  TNFa = c(rnorm(30, 10, 2), rnorm(30, 20, 4)),
  IL10 = c(rnorm(30, 8, 2), rnorm(30, 12, 3)),
  IFNg = c(rnorm(30, 7, 1.5), rnorm(30, 14, 3)),
  IL1b = c(rnorm(30, 6, 1), rnorm(30, 18, 4))
)

head(cytokine_df)
##     Group      IL6      TNFa      IL10     IFNg     IL1b
## 1 Control 4.439524 10.759279  8.235293 5.405011 5.211378
## 2 Control 4.769823  8.995353  6.105051 8.894778 5.497801
## 3 Control 6.558708  9.333585  7.018885 6.475524 7.496061
## 4 Control 5.070508  7.962849  7.487816 5.701731 4.862696
## 5 Control 5.129288  7.856418 11.687724 6.645581 5.820948
## 6 Control 6.715065 10.607057  6.696100 6.704236 7.902362
write.csv(cytokine_df, "cytokine_data.csv", row.names = FALSE)
# 2. Clean the data set

## Remove non-numeric columns to create an analyte only data set

analyte_df <- cytokine_df[sapply(cytokine_df, is.numeric)]
head(analyte_df)
##        IL6      TNFa      IL10     IFNg     IL1b
## 1 4.439524 10.759279  8.235293 5.405011 5.211378
## 2 4.769823  8.995353  6.105051 8.894778 5.497801
## 3 6.558708  9.333585  7.018885 6.475524 7.496061
## 4 5.070508  7.962849  7.487816 5.701731 4.862696
## 5 5.129288  7.856418 11.687724 6.645581 5.820948
## 6 6.715065 10.607057  6.696100 6.704236 7.902362
group <- cytokine_df$Group
group
##  [1] "Control" "Control" "Control" "Control" "Control" "Control" "Control"
##  [8] "Control" "Control" "Control" "Control" "Control" "Control" "Control"
## [15] "Control" "Control" "Control" "Control" "Control" "Control" "Control"
## [22] "Control" "Control" "Control" "Control" "Control" "Control" "Control"
## [29] "Control" "Control" "Sepsis"  "Sepsis"  "Sepsis"  "Sepsis"  "Sepsis" 
## [36] "Sepsis"  "Sepsis"  "Sepsis"  "Sepsis"  "Sepsis"  "Sepsis"  "Sepsis" 
## [43] "Sepsis"  "Sepsis"  "Sepsis"  "Sepsis"  "Sepsis"  "Sepsis"  "Sepsis" 
## [50] "Sepsis"  "Sepsis"  "Sepsis"  "Sepsis"  "Sepsis"  "Sepsis"  "Sepsis" 
## [57] "Sepsis"  "Sepsis"  "Sepsis"  "Sepsis"
## Check for missing values

sum(is.na(analyte_df))
## [1] 0
analyte_df <- na.omit(analyte_df)
nrow(analyte_df)
## [1] 60
# Scale the data

analyte_df <- scale(analyte_df)
head(analyte_df)
##          IL6       TNFa       IL10       IFNg       IL1b
## 1 -0.9523539 -0.7291846 -0.5136128 -1.2486242 -1.0407570
## 2 -0.8997762 -1.0446403 -1.1175311 -0.3721546 -1.0009437
## 3 -0.6150163 -0.9841518 -0.8584614 -0.9797602 -0.7231829
## 4 -0.8519121 -1.2292905 -0.7255208 -1.1741017 -1.0892242
## 5 -0.8425555 -1.2483244  0.4651426 -0.9370498 -0.9560258
## 6 -0.5901270 -0.7564075 -0.9499701 -0.9223183 -0.6667065
# 3. Run PCA

pca_cytokines <- prcomp(analyte_df) # don't scale and center again as we have already done that
summary(pca_cytokines)
## Importance of components:
##                          PC1     PC2     PC3     PC4     PC5
## Standard deviation     1.965 0.70292 0.53269 0.45943 0.38853
## Proportion of Variance 0.772 0.09882 0.05675 0.04221 0.03019
## Cumulative Proportion  0.772 0.87084 0.92759 0.96981 1.00000
# Calculate the percent variance explained

percent_var <- round(100 * pca_cytokines$sdev^2 / sum(pca_cytokines$sdev^2), 1)
percent_var # this can just be accessed in summary though
## [1] 77.2  9.9  5.7  4.2  3.0
# Make the scree plot to determine which components (PCs) matter

scree_df <- data.frame(
  PC = 1:length(percent_var),
  variance = percent_var
)
head(scree_df)
##   PC variance
## 1  1     77.2
## 2  2      9.9
## 3  3      5.7
## 4  4      4.2
## 5  5      3.0
ggplot(scree_df, aes(PC, variance))+
  geom_col(fill = scree_df$PC) +
  geom_line(colour = "red") +
  geom_point() +
  xlab("Principal Components") +
  ylab("% Variance Explained") +
  labs(title = "Significance of Components")

# Plot the PCA

plot(pca_cytokines$x[,1], pca_cytokines$x[,2],
     xlab = "PC1",
     ylab = "PC2",
     main = "PCA of Cytokines",
     col = as.factor(group),
     pch = 19)
legend("topright", legend = levels(as.factor(group)), col = 1:length(levels((as.factor(group)))), pch = 19)

pca_df <- data.frame(
  PC1 = pca_cytokines$x[,1],
  PC2 = pca_cytokines$x[,2],
  Group = as.factor(group))


ggplot(pca_df, aes(x = PC1, y = PC2, color = Group, fill = Group)) +
  geom_point(shape = 21, size = 3) +
  stat_ellipse(geom = "polygon", level = 0.95, alpha = 0.2) +
  labs(title = "PCA of Cytokines", x = "PC1", y = "PC2") +
  theme_minimal()

loadings <- as.data.frame(pca_cytokines$rotation)
loadings <- loadings %>% 
  arrange(desc(PC1)) %>% 
  mutate(cytokine = rownames(loadings)) %>% 
  relocate(cytokine)
  
loadings
##      cytokine       PC1        PC2           PC3         PC4         PC5
## IL6       IL6 0.4648631  0.0805039  0.1238484145  0.84812576  0.20679859
## IL1b     TNFa 0.4603097 -0.3976752  0.0004816833 -0.38399462  0.69463463
## TNFa     IL10 0.4553753 -0.3074127  0.5871541113 -0.16704804 -0.57050487
## IFNg     IFNg 0.4526453 -0.1214615 -0.7951997361 -0.02686926 -0.38379003
## IL10     IL1b 0.3997035  0.8521256  0.0869986643 -0.32342514  0.04411879
# Tells us the "loadings", which cytokines contribute most to the each PC

# Plot the loadings of PC1

ggplot(loadings, aes(x = cytokine, y = PC1, color = cytokine, fill = cytokine))+
  geom_col()+
  labs(x = "Cytokine", y = "PC1", title = "Loadings")+
  theme_classic()

# 4. Perform Hierarchical Clustering

# Make the distance matrix

dist_matrix <- dist(analyte_df)
head(dist_matrix)
## [1] 1.1121053 0.6861972 0.5594659 1.1592161 0.7541528 0.8431773
# Make the tree
h_cluster <- hclust(dist_matrix)
h_cluster <- color_labels(h_cluster, k=2) # start k as number of groups ("Sepsis", "Control")
## Loading required namespace: colorspace
par(mar = c(8,4,4,2))
plot(h_cluster,
     main = "Hierarchial Clustering of Cytokine Samples")

## Cut into clusters

clusters <-cutree(h_cluster, k = 2)

table(clusters, group) # Successfully clustered all samples separately based on experimental groups
##         group
## clusters Control Sepsis
##        1      30      0
##        2       0     30
# 5. Perform K Means Clustering (another method of clustering)

## Note: This is better utilized on larger data sets

set.seed(123)
k_cluster <- kmeans(dist_matrix, centers = 2) # number of classes
k_cluster
## K-means clustering with 2 clusters of sizes 30, 30
## 
## Cluster means:
##           1        2         3         4        5         6         7         8
## 1 4.1456929 4.102294 3.9569608 4.3698158 3.977639 3.8334736 3.6434949 4.0922622
## 2 0.8932881 0.986340 0.8521408 0.9141373 1.367423 0.8732141 0.9075954 0.8305362
##           9       10       11       12        13        14       15       16
## 1 4.0053785 3.786961 3.755491 4.361037 3.7723075 4.2747086 4.704717 3.227633
## 2 0.9402482 1.059811 1.176423 1.184465 0.8488293 0.8657326 1.339505 1.338603
##         17       18       19       20       21        22       23       24
## 1 4.137479 4.362290 3.783230 4.609115 3.672964 3.8000177 4.465083 4.097538
## 2 1.015126 1.155538 1.371041 1.084568 1.212955 0.9076474 1.027274 1.044676
##         25        26       27        28       29       30       31       32
## 1 4.324527 4.1056891 4.221151 3.9895775 3.682213 4.002900 1.673732 1.605345
## 2 1.072520 0.8623632 1.081756 0.9671548 1.566147 1.008725 4.699577 4.390105
##         33       34       35       36       37       38       39       40
## 1 1.787385 2.038402 1.891509 2.007597 2.131332 1.858451 1.766880 1.821908
## 2 4.834408 3.230841 4.483649 4.367354 4.895343 4.057235 3.678239 2.763987
##         41       42       43       44       45       46       47       48
## 1 2.087259 1.783975 2.254014 3.354479 1.763943 1.682166 1.608361 2.024667
## 2 3.159950 3.977701 3.706982 6.109713 3.783529 3.543581 3.390339 3.353934
##         49       50       51       52       53       54       55       56
## 1 1.667895 1.671109 2.100288 1.482968 1.782589 2.419323 1.860207 2.202776
## 2 4.332765 3.913583 4.747316 4.186458 3.718216 4.822860 4.130592 4.003353
##         57       58       59       60
## 1 2.047715 1.758468 1.513702 1.706593
## 2 3.958553 3.727874 3.763886 3.531719
## 
## Clustering vector:
##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 
##  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2 
## 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 
##  2  2  2  2  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 
## 53 54 55 56 57 58 59 60 
##  1  1  1  1  1  1  1  1 
## 
## Within cluster sum of squares by cluster:
## [1] 800.7523 278.4449
##  (between_SS / total_SS =  85.3 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
## Plot the clusters on the PCA

plot(pca_cytokines$x[,1], pca_cytokines$x[,2],
     col = k_cluster$cluster,
     pch = 19,
     main = "K-means Clustering on PCA",
     xlab = "PC1",
     ylab = "PC2")

# 6. Make a Heat Map

heatmap(analyte_df, scale = "none", Rowv = NA, Colv = NA)

heatmap(analyte_df, scale = "none") # With the dendogram 

## Make a heat map with pheatmap package

annotation <- data.frame("Treatment" = cytokine_df$Group)
rownames(annotation) <- rownames(cytokine_df)
head(annotation)
##   Treatment
## 1   Control
## 2   Control
## 3   Control
## 4   Control
## 5   Control
## 6   Control
ann_colors <- list(Group = c(Control = "blue", Sepsis = "red"))

pheatmap(analyte_df, scale = "none", annotation_row = annotation, annotation_colors = ann_colors, show_rownames = FALSE)