# NEEDED PACKAGES:
suppressPackageStartupMessages({
library(ggplot2)
library(car)
library(dplyr)
library(tidyverse)
library(dendextend)
library(pheatmap)
})
# 1. Prepare the data set
set.seed(123)
cytokine_df <- data.frame(
Group = rep(c("Control", "Sepsis"), each = 30),
IL6 = c(rnorm(30, 5, 1), rnorm(30, 15, 5)),
TNFa = c(rnorm(30, 10, 2), rnorm(30, 20, 4)),
IL10 = c(rnorm(30, 8, 2), rnorm(30, 12, 3)),
IFNg = c(rnorm(30, 7, 1.5), rnorm(30, 14, 3)),
IL1b = c(rnorm(30, 6, 1), rnorm(30, 18, 4))
)
head(cytokine_df)
## Group IL6 TNFa IL10 IFNg IL1b
## 1 Control 4.439524 10.759279 8.235293 5.405011 5.211378
## 2 Control 4.769823 8.995353 6.105051 8.894778 5.497801
## 3 Control 6.558708 9.333585 7.018885 6.475524 7.496061
## 4 Control 5.070508 7.962849 7.487816 5.701731 4.862696
## 5 Control 5.129288 7.856418 11.687724 6.645581 5.820948
## 6 Control 6.715065 10.607057 6.696100 6.704236 7.902362
write.csv(cytokine_df, "cytokine_data.csv", row.names = FALSE)
# 2. Clean the data set
## Remove non-numeric columns to create an analyte only data set
analyte_df <- cytokine_df[sapply(cytokine_df, is.numeric)]
head(analyte_df)
## IL6 TNFa IL10 IFNg IL1b
## 1 4.439524 10.759279 8.235293 5.405011 5.211378
## 2 4.769823 8.995353 6.105051 8.894778 5.497801
## 3 6.558708 9.333585 7.018885 6.475524 7.496061
## 4 5.070508 7.962849 7.487816 5.701731 4.862696
## 5 5.129288 7.856418 11.687724 6.645581 5.820948
## 6 6.715065 10.607057 6.696100 6.704236 7.902362
group <- cytokine_df$Group
group
## [1] "Control" "Control" "Control" "Control" "Control" "Control" "Control"
## [8] "Control" "Control" "Control" "Control" "Control" "Control" "Control"
## [15] "Control" "Control" "Control" "Control" "Control" "Control" "Control"
## [22] "Control" "Control" "Control" "Control" "Control" "Control" "Control"
## [29] "Control" "Control" "Sepsis" "Sepsis" "Sepsis" "Sepsis" "Sepsis"
## [36] "Sepsis" "Sepsis" "Sepsis" "Sepsis" "Sepsis" "Sepsis" "Sepsis"
## [43] "Sepsis" "Sepsis" "Sepsis" "Sepsis" "Sepsis" "Sepsis" "Sepsis"
## [50] "Sepsis" "Sepsis" "Sepsis" "Sepsis" "Sepsis" "Sepsis" "Sepsis"
## [57] "Sepsis" "Sepsis" "Sepsis" "Sepsis"
## Check for missing values
sum(is.na(analyte_df))
## [1] 0
analyte_df <- na.omit(analyte_df)
nrow(analyte_df)
## [1] 60
# Scale the data
analyte_df <- scale(analyte_df)
head(analyte_df)
## IL6 TNFa IL10 IFNg IL1b
## 1 -0.9523539 -0.7291846 -0.5136128 -1.2486242 -1.0407570
## 2 -0.8997762 -1.0446403 -1.1175311 -0.3721546 -1.0009437
## 3 -0.6150163 -0.9841518 -0.8584614 -0.9797602 -0.7231829
## 4 -0.8519121 -1.2292905 -0.7255208 -1.1741017 -1.0892242
## 5 -0.8425555 -1.2483244 0.4651426 -0.9370498 -0.9560258
## 6 -0.5901270 -0.7564075 -0.9499701 -0.9223183 -0.6667065
# 3. Run PCA
pca_cytokines <- prcomp(analyte_df) # don't scale and center again as we have already done that
summary(pca_cytokines)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5
## Standard deviation 1.965 0.70292 0.53269 0.45943 0.38853
## Proportion of Variance 0.772 0.09882 0.05675 0.04221 0.03019
## Cumulative Proportion 0.772 0.87084 0.92759 0.96981 1.00000
# Calculate the percent variance explained
percent_var <- round(100 * pca_cytokines$sdev^2 / sum(pca_cytokines$sdev^2), 1)
percent_var # this can just be accessed in summary though
## [1] 77.2 9.9 5.7 4.2 3.0
# Make the scree plot to determine which components (PCs) matter
scree_df <- data.frame(
PC = 1:length(percent_var),
variance = percent_var
)
head(scree_df)
## PC variance
## 1 1 77.2
## 2 2 9.9
## 3 3 5.7
## 4 4 4.2
## 5 5 3.0
ggplot(scree_df, aes(PC, variance))+
geom_col(fill = scree_df$PC) +
geom_line(colour = "red") +
geom_point() +
xlab("Principal Components") +
ylab("% Variance Explained") +
labs(title = "Significance of Components")

# Plot the PCA
plot(pca_cytokines$x[,1], pca_cytokines$x[,2],
xlab = "PC1",
ylab = "PC2",
main = "PCA of Cytokines",
col = as.factor(group),
pch = 19)
legend("topright", legend = levels(as.factor(group)), col = 1:length(levels((as.factor(group)))), pch = 19)

pca_df <- data.frame(
PC1 = pca_cytokines$x[,1],
PC2 = pca_cytokines$x[,2],
Group = as.factor(group))
ggplot(pca_df, aes(x = PC1, y = PC2, color = Group, fill = Group)) +
geom_point(shape = 21, size = 3) +
stat_ellipse(geom = "polygon", level = 0.95, alpha = 0.2) +
labs(title = "PCA of Cytokines", x = "PC1", y = "PC2") +
theme_minimal()

loadings <- as.data.frame(pca_cytokines$rotation)
loadings <- loadings %>%
arrange(desc(PC1)) %>%
mutate(cytokine = rownames(loadings)) %>%
relocate(cytokine)
loadings
## cytokine PC1 PC2 PC3 PC4 PC5
## IL6 IL6 0.4648631 0.0805039 0.1238484145 0.84812576 0.20679859
## IL1b TNFa 0.4603097 -0.3976752 0.0004816833 -0.38399462 0.69463463
## TNFa IL10 0.4553753 -0.3074127 0.5871541113 -0.16704804 -0.57050487
## IFNg IFNg 0.4526453 -0.1214615 -0.7951997361 -0.02686926 -0.38379003
## IL10 IL1b 0.3997035 0.8521256 0.0869986643 -0.32342514 0.04411879
# Tells us the "loadings", which cytokines contribute most to the each PC
# Plot the loadings of PC1
ggplot(loadings, aes(x = cytokine, y = PC1, color = cytokine, fill = cytokine))+
geom_col()+
labs(x = "Cytokine", y = "PC1", title = "Loadings")+
theme_classic()

# 4. Perform Hierarchical Clustering
# Make the distance matrix
dist_matrix <- dist(analyte_df)
head(dist_matrix)
## [1] 1.1121053 0.6861972 0.5594659 1.1592161 0.7541528 0.8431773
# Make the tree
h_cluster <- hclust(dist_matrix)
h_cluster <- color_labels(h_cluster, k=2) # start k as number of groups ("Sepsis", "Control")
## Loading required namespace: colorspace
par(mar = c(8,4,4,2))
plot(h_cluster,
main = "Hierarchial Clustering of Cytokine Samples")

## Cut into clusters
clusters <-cutree(h_cluster, k = 2)
table(clusters, group) # Successfully clustered all samples separately based on experimental groups
## group
## clusters Control Sepsis
## 1 30 0
## 2 0 30
# 5. Perform K Means Clustering (another method of clustering)
## Note: This is better utilized on larger data sets
set.seed(123)
k_cluster <- kmeans(dist_matrix, centers = 2) # number of classes
k_cluster
## K-means clustering with 2 clusters of sizes 30, 30
##
## Cluster means:
## 1 2 3 4 5 6 7 8
## 1 4.1456929 4.102294 3.9569608 4.3698158 3.977639 3.8334736 3.6434949 4.0922622
## 2 0.8932881 0.986340 0.8521408 0.9141373 1.367423 0.8732141 0.9075954 0.8305362
## 9 10 11 12 13 14 15 16
## 1 4.0053785 3.786961 3.755491 4.361037 3.7723075 4.2747086 4.704717 3.227633
## 2 0.9402482 1.059811 1.176423 1.184465 0.8488293 0.8657326 1.339505 1.338603
## 17 18 19 20 21 22 23 24
## 1 4.137479 4.362290 3.783230 4.609115 3.672964 3.8000177 4.465083 4.097538
## 2 1.015126 1.155538 1.371041 1.084568 1.212955 0.9076474 1.027274 1.044676
## 25 26 27 28 29 30 31 32
## 1 4.324527 4.1056891 4.221151 3.9895775 3.682213 4.002900 1.673732 1.605345
## 2 1.072520 0.8623632 1.081756 0.9671548 1.566147 1.008725 4.699577 4.390105
## 33 34 35 36 37 38 39 40
## 1 1.787385 2.038402 1.891509 2.007597 2.131332 1.858451 1.766880 1.821908
## 2 4.834408 3.230841 4.483649 4.367354 4.895343 4.057235 3.678239 2.763987
## 41 42 43 44 45 46 47 48
## 1 2.087259 1.783975 2.254014 3.354479 1.763943 1.682166 1.608361 2.024667
## 2 3.159950 3.977701 3.706982 6.109713 3.783529 3.543581 3.390339 3.353934
## 49 50 51 52 53 54 55 56
## 1 1.667895 1.671109 2.100288 1.482968 1.782589 2.419323 1.860207 2.202776
## 2 4.332765 3.913583 4.747316 4.186458 3.718216 4.822860 4.130592 4.003353
## 57 58 59 60
## 1 2.047715 1.758468 1.513702 1.706593
## 2 3.958553 3.727874 3.763886 3.531719
##
## Clustering vector:
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52
## 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 53 54 55 56 57 58 59 60
## 1 1 1 1 1 1 1 1
##
## Within cluster sum of squares by cluster:
## [1] 800.7523 278.4449
## (between_SS / total_SS = 85.3 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
## Plot the clusters on the PCA
plot(pca_cytokines$x[,1], pca_cytokines$x[,2],
col = k_cluster$cluster,
pch = 19,
main = "K-means Clustering on PCA",
xlab = "PC1",
ylab = "PC2")

# 6. Make a Heat Map
heatmap(analyte_df, scale = "none", Rowv = NA, Colv = NA)

heatmap(analyte_df, scale = "none") # With the dendogram

## Make a heat map with pheatmap package
annotation <- data.frame("Treatment" = cytokine_df$Group)
rownames(annotation) <- rownames(cytokine_df)
head(annotation)
## Treatment
## 1 Control
## 2 Control
## 3 Control
## 4 Control
## 5 Control
## 6 Control
ann_colors <- list(Group = c(Control = "blue", Sepsis = "red"))
pheatmap(analyte_df, scale = "none", annotation_row = annotation, annotation_colors = ann_colors, show_rownames = FALSE)
