# data
library(here) # The location management : import/export
library(tidyverse) # data management
#graphs
library(ggplot2) # graphs
library(RColorBrewer) # color palette
library(ggpubr) # combine graphs
# analysis
library(moments) # for skewnness and kurtosis
library(rstatix) # statistical tests
# publication ready table
library(gtsummary) # table generation
library(flextable) # save the tableCase Study 2
About the Data
The data set includes counts of T cells, CD4+ T cells, CD8+ T cells, regulatory T cells (Tregs), T helper cells (Th), B cells, natural killer (NK) cells, and CD45.1+ cells, which are key indicators of immune function from three experimental groups: PBS Control Group, Isotope Control Group and Treatment Group. This data set is simulated using the example discussed the course by Peter Mac Data Science Training. To download the data set, click here
The preview of the data:
Loading R Packages
To use the following R packages in your code, you’ll need to install them first. Installation requires an active internet connection. Once installed, you do not need an internet connection to load the packages in future sessions.
Data Import
To import the data file named ‘case_study2.csv’, which is stored in the ‘data’ folder of your project directory, use the following R code.
D <- read.csv(here("data", "case_study2.csv"))
glimpse(D)Rows: 45
Columns: 8
$ Group <chr> "PBS", "PBS", "PBS", "PBS", "PBS", "PBS", "PBS", "PBS", "PBS",…
$ CD4T <int> 4882, 5004, 4936, 4742, 4957, 4924, 4908, 5011, 4849, 4926, 47…
$ CD8T <int> 1003, 1005, 1078, 1008, 1029, 965, 1071, 1007, 965, 965, 1045,…
$ T_cells <int> 5885, 6010, 6014, 5750, 5986, 5889, 5979, 6018, 5814, 5891, 58…
$ Treg <int> 338, 351, 359, 327, 365, 347, 344, 362, 315, 372, 358, 331, 27…
$ Th <int> 2773, 3125, 3160, 3066, 3130, 3015, 2992, 3051, 3221, 3244, 30…
$ NK <int> 309, 330, 354, 453, 268, 335, 366, 289, 444, 337, 223, 440, 32…
$ CD45.1 <dbl> 8278.534, 8151.538, 7882.493, 7957.416, 8012.542, 7858.919, 79…
sum(is.na(D)) # number of missing values[1] 0
summary(D) Group CD4T CD8T T_cells
Length:45 Min. :4742 Min. : 965 Min. :5750
Class :character 1st Qu.:4920 1st Qu.:1004 1st Qu.:5929
Mode :character Median :5018 Median :1029 Median :6026
Mean :5273 Mean :1040 Mean :6313
3rd Qu.:5827 3rd Qu.:1071 3rd Qu.:6886
Max. :6106 Max. :1186 Max. :7127
Treg Th NK CD45.1
Min. :246.0 Min. :2423 Min. :196.0 Min. : 313.3
1st Qu.:301.0 1st Qu.:2687 1st Qu.:309.0 1st Qu.: 670.1
Median :328.0 Median :2773 Median :375.0 Median :7820.1
Mean :324.3 Mean :2826 Mean :360.8 Mean :5490.9
3rd Qu.:347.0 3rd Qu.:3015 3rd Qu.:413.0 3rd Qu.:8012.5
Max. :391.0 Max. :3244 Max. :512.0 Max. :8668.7
By default, the categories of the Group variable in data set D are arranged in alphabetical order, which may not reflect the desired logical order for analysis or visualization.
table(D$Group)
Isotope PBS Treatment
15 15 15
The following R code ensures that the Group variable is treated as a factor with a specified order of levels: "PBS", "Isotope", and "Treatment".
D <- D |> mutate(Group = factor(Group, levels = c("PBS", "Isotope", "Treatment")))
table(D$Group)
PBS Isotope Treatment
15 15 15
Descriptive Statistics
This R code calculates descriptive statistics immune cell type variables in the data set D, grouped by the Group variable.
tb_CD4T <- D |> group_by(Group) |>
summarise(mean = mean(CD4T),
sd = sd(CD4T),
median = median(CD4T),
skew = skewness(CD4T),
kurt = kurtosis(CD4T))
tb_CD8T <- D |> group_by(Group) |>
summarise(mean = mean(CD8T),
sd = sd(CD8T),
median = median(CD8T),
skew = skewness(CD8T),
kurt = kurtosis(CD8T))
tb_T_cells <- D |> group_by(Group) |>
summarise(mean = mean(T_cells),
sd = sd(T_cells),
median = median(T_cells),
skew = skewness(T_cells),
kurt = kurtosis(T_cells))
tb_Treg <- D |> group_by(Group) |>
summarise(mean = mean(Treg),
sd = sd(Treg),
median = median(Treg),
skew = skewness(Treg),
kurt = kurtosis(Treg))
tb_Th <- D |> group_by(Group) |>
summarise(mean = mean(Th),
sd = sd(Th),
median = median(Th),
skew = skewness(Th),
kurt = kurtosis(Th))
tb_NK <- D |> group_by(Group) |>
summarise(mean = mean(NK),
sd = sd(NK),
median = median(NK),
skew = skewness(NK),
kurt = kurtosis(NK))
tb_CD45.1 <- D |> group_by(Group) |>
summarise(mean = mean(CD45.1),
sd = sd(CD45.1),
median = median(CD45.1),
skew = skewness(CD45.1),
kurt = kurtosis(CD45.1))The output of the above code chunk is a tibble containing the descriptive statistics for each variable, including the mean, standard deviation, median, skewness, kurtosis.
print(tb_CD4T)# A tibble: 3 × 6
Group mean sd median skew kurt
<fct> <dbl> <dbl> <int> <dbl> <dbl>
1 PBS 4903. 92.3 4908 -0.0528 2.46
2 Isotope 5010. 105. 4990 0.700 2.72
3 Treatment 5907. 103. 5903 0.381 2.29
print(tb_CD8T)# A tibble: 3 × 6
Group mean sd median skew kurt
<fct> <dbl> <dbl> <int> <dbl> <dbl>
1 PBS 1024. 43.0 1008 0.0302 1.71
2 Isotope 1047. 66.8 1024 0.784 2.37
3 Treatment 1049. 34.3 1044 0.0558 1.81
print(tb_T_cells)# A tibble: 3 × 6
Group mean sd median skew kurt
<fct> <dbl> <dbl> <int> <dbl> <dbl>
1 PBS 5927. 104. 5906 0.468 3.07
2 Isotope 6057 132. 6020 0.697 2.20
3 Treatment 6957. 99.9 6953 -0.175 2.89
print(tb_Treg)# A tibble: 3 × 6
Group mean sd median skew kurt
<fct> <dbl> <dbl> <int> <dbl> <dbl>
1 PBS 344. 26.3 347 -1.20 4.52
2 Isotope 331. 27.9 328 0.903 3.03
3 Treatment 299. 33.8 299 -0.0480 1.84
print(tb_Th)# A tibble: 3 × 6
Group mean sd median skew kurt
<fct> <dbl> <dbl> <int> <dbl> <dbl>
1 PBS 3080. 117. 3109 -1.05 4.22
2 Isotope 2654. 118. 2695 -0.391 2.24
3 Treatment 2744. 137. 2754 -0.326 2.31
print(tb_NK)# A tibble: 3 × 6
Group mean sd median skew kurt
<fct> <dbl> <dbl> <int> <dbl> <dbl>
1 PBS 348. 66.9 337 0.0890 2.31
2 Isotope 409. 32.1 407 0.748 4.57
3 Treatment 325. 94.2 308 0.413 2.26
print(tb_CD45.1)# A tibble: 3 × 6
Group mean sd median skew kurt
<fct> <dbl> <dbl> <dbl> <dbl> <dbl>
1 PBS 7999. 367. 7990. -0.117 2.95
2 Isotope 7921. 136. 7947. -0.457 2.21
3 Treatment 553. 144. 537. -0.00657 1.78
Assessment of Normality of Assumption
This code assesses the normality assumption for various variables in the D dataset, grouped by the Group variable, using the Shapiro-Wilk test. The Shapiro-Wilk test is a statistical test to check whether a variable is normally distributed. This is done using the {rstatix} package in R, which provides a convenient function shapiro_test() for performing the test.
D |> group_by(Group) |> shapiro_test(CD4T)# A tibble: 3 × 4
Group variable statistic p
<fct> <chr> <dbl> <dbl>
1 PBS CD4T 0.976 0.934
2 Isotope CD4T 0.937 0.347
3 Treatment CD4T 0.967 0.818
D |> group_by(Group) |> shapiro_test(CD8T)# A tibble: 3 × 4
Group variable statistic p
<fct> <chr> <dbl> <dbl>
1 PBS CD8T 0.911 0.142
2 Isotope CD8T 0.875 0.0400
3 Treatment CD8T 0.946 0.471
D |> group_by(Group) |> shapiro_test(T_cells)# A tibble: 3 × 4
Group variable statistic p
<fct> <chr> <dbl> <dbl>
1 PBS T_cells 0.963 0.738
2 Isotope T_cells 0.886 0.0582
3 Treatment T_cells 0.970 0.859
D |> group_by(Group) |> shapiro_test(Treg)# A tibble: 3 × 4
Group variable statistic p
<fct> <chr> <dbl> <dbl>
1 PBS Treg 0.911 0.141
2 Isotope Treg 0.905 0.115
3 Treatment Treg 0.952 0.561
D |> group_by(Group) |> shapiro_test(Th)# A tibble: 3 × 4
Group variable statistic p
<fct> <chr> <dbl> <dbl>
1 PBS Th 0.924 0.223
2 Isotope Th 0.957 0.647
3 Treatment Th 0.955 0.609
D |> group_by(Group) |> shapiro_test(NK)# A tibble: 3 × 4
Group variable statistic p
<fct> <chr> <dbl> <dbl>
1 PBS NK 0.946 0.465
2 Isotope NK 0.933 0.304
3 Treatment NK 0.961 0.706
D |> group_by(Group) |> shapiro_test(CD45.1)# A tibble: 3 × 4
Group variable statistic p
<fct> <chr> <dbl> <dbl>
1 PBS CD45.1 0.978 0.957
2 Isotope CD45.1 0.952 0.560
3 Treatment CD45.1 0.951 0.548
Assessment of Normality of Assumption
The following code assesses the homogeneity of variances (homoscedasticity) across different groups for several variables in the D dataset, using Levene’s Test from the {rstatix} package. Levene’s Test is used to evaluate whether the variances are equal across the groups, which is an important assumption for one way ANOVA.
D |> levene_test(CD4T ~ Group)# A tibble: 1 × 4
df1 df2 statistic p
<int> <int> <dbl> <dbl>
1 2 42 0.202 0.818
D |> levene_test(CD8T ~ Group)# A tibble: 1 × 4
df1 df2 statistic p
<int> <int> <dbl> <dbl>
1 2 42 1.16 0.323
D |> levene_test(T_cells ~ Group)# A tibble: 1 × 4
df1 df2 statistic p
<int> <int> <dbl> <dbl>
1 2 42 0.520 0.598
D |> levene_test(Treg ~ Group)# A tibble: 1 × 4
df1 df2 statistic p
<int> <int> <dbl> <dbl>
1 2 42 1.02 0.370
D |> levene_test(Th ~ Group)# A tibble: 1 × 4
df1 df2 statistic p
<int> <int> <dbl> <dbl>
1 2 42 0.297 0.745
D |> levene_test(NK ~ Group)# A tibble: 1 × 4
df1 df2 statistic p
<int> <int> <dbl> <dbl>
1 2 42 6.08 0.00480
D |> levene_test(CD45.1 ~ Group)# A tibble: 1 × 4
df1 df2 statistic p
<int> <int> <dbl> <dbl>
1 2 42 5.22 0.00942
Visualization Using Box plots
The basic boxplot code is:
ggplot(D, aes(x = Group, y = CD4T,col = Group)) +
geom_boxplot() +
labs(title = "Boxplot of CD4+ T cells by Group",
x = "Group",
y = "CD4+ T cells") +
theme_minimal()the color scheme using the {RColorBrewer} package, which provides a variety of color palettes. One such palette, “Set2” is used in the following code
ggplot(D, aes(x = Group, y = CD4T,fill = Group)) +
geom_boxplot() +
scale_fill_brewer(palette = "Set2") +
labs(title = "Boxplot of CD4+ T cells by Group",
x = "Group",
y = "CD4+ T cells") +
theme_minimal()The list of available color palettes, along with their details, can be obtained using the following code:
brewer.pal.info maxcolors category colorblind
BrBG 11 div TRUE
PiYG 11 div TRUE
PRGn 11 div TRUE
PuOr 11 div TRUE
RdBu 11 div TRUE
RdGy 11 div FALSE
RdYlBu 11 div TRUE
RdYlGn 11 div FALSE
Spectral 11 div FALSE
Accent 8 qual FALSE
Dark2 8 qual TRUE
Paired 12 qual TRUE
Pastel1 9 qual FALSE
Pastel2 8 qual FALSE
Set1 9 qual FALSE
Set2 8 qual TRUE
Set3 12 qual FALSE
Blues 9 seq TRUE
BuGn 9 seq TRUE
BuPu 9 seq TRUE
GnBu 9 seq TRUE
Greens 9 seq TRUE
Greys 9 seq TRUE
Oranges 9 seq TRUE
OrRd 9 seq TRUE
PuBu 9 seq TRUE
PuBuGn 9 seq TRUE
PuRd 9 seq TRUE
Purples 9 seq TRUE
RdPu 9 seq TRUE
Reds 9 seq TRUE
YlGn 9 seq TRUE
YlGnBu 9 seq TRUE
YlOrBr 9 seq TRUE
YlOrRd 9 seq TRUE
This boxplot includes black borders (color = "black") and a semi-transparent fill (alpha = 0.6) for better visual clarity.
ggplot(D, aes(x = Group, y = CD4T,fill = Group)) +
geom_boxplot(color = "black", alpha = 0.6) +
scale_fill_brewer(palette = "Set2") +
labs(title = "Boxplot of CD4+ T cells by Group",
x = "Group",
y = "CD4+ T cells") +
theme_minimal()To add jittered data Points on top:
ggplot(D, aes(x = Group, y = CD4T, fill = Group)) +
geom_boxplot(color = "black", alpha = 0.6) +
geom_jitter(width = 0.2, color = "black", alpha = 0.6) + # Add jittered dots
scale_fill_brewer(palette = "Set2") +
theme_minimal()One Way ANOVA
This code performs one-way ANOVA tests to assess whether there are significant differences in the means of various immune cell counts (CD4T, CD8T, T_cells, Treg, Th, NK, CD45.1) across different groups in the D data set. The anova_test() function from the {rstatix} package is used to conduct the ANOVA for each variable, with the detailed = T providing detailed output, including the F-statistic, p-value, and additional information
D |> anova_test(CD4T ~ Group, detailed = T)ANOVA Table (type II tests)
Effect SSn SSd DFn DFd F p p<.05 ges
1 Group 9135584 421348.8 2 42 455.317 3.39e-29 * 0.956
D |> anova_test(CD8T ~ Group, detailed = T)ANOVA Table (type II tests)
Effect SSn SSd DFn DFd F p p<.05 ges
1 Group 5754.178 104945.1 2 42 1.151 0.326 0.052
D |> anova_test(T_cells ~ Group, detailed = T)ANOVA Table (type II tests)
Effect SSn SSd DFn DFd F p p<.05 ges
1 Group 9430769 536130.7 2 42 369.399 2.21e-27 * 0.946
D |> anova_test(Treg ~ Group, detailed = T)ANOVA Table (type II tests)
Effect SSn SSd DFn DFd F p p<.05 ges
1 Group 16116.13 36599.87 2 42 9.247 0.00047 * 0.306
D |> anova_test(Th ~ Group, detailed = T)ANOVA Table (type II tests)
Effect SSn SSd DFn DFd F p p<.05 ges
1 Group 1508775 651836.4 2 42 48.608 1.18e-11 * 0.698
D |> anova_test(NK ~ Group, detailed = T)ANOVA Table (type II tests)
Effect SSn SSd DFn DFd F p p<.05 ges
1 Group 55901.64 201224.1 2 42 5.834 0.006 * 0.217
D |> anova_test(CD45.1 ~ Group, detailed = T)ANOVA Table (type II tests)
Effect SSn SSd DFn DFd F p p<.05 ges
1 Group 548583411 2432905 2 42 4735.184 3.5e-50 * 0.996
For CD8T, the p-value is 0.326, which is greater than the significance level of 0.05. This indicates that there is no significant difference in the means of CD8T across the groups. As a result, no post-hoc tests are conducted for CD8T.
For the other variables (CD4T, T_cells, Treg, Th, NK, CD45.1), the p-values are less than 0.05, suggesting significant differences between the group means, and post-hoc tests are performed to identify which groups are different.
D |> tukey_hsd(CD4T ~ Group)# A tibble: 3 × 9
term group1 group2 null.value estimate conf.low conf.high p.adj
* <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Group PBS Isotope 0 107. 18.1 196. 1.5 e- 2
2 Group PBS Treatment 0 1005. 916. 1094. 1.06e-12
3 Group Isotope Treatment 0 898. 809. 987. 1.06e-12
# ℹ 1 more variable: p.adj.signif <chr>
D |> tukey_hsd(T_cells ~ Group, detailed = T)# A tibble: 3 × 9
term group1 group2 null.value estimate conf.low conf.high p.adj
* <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Group PBS Isotope 0 130. 29.8 230. 8.22e- 3
2 Group PBS Treatment 0 1030. 929. 1130. 1.06e-12
3 Group Isotope Treatment 0 900. 799. 1000. 1.06e-12
# ℹ 1 more variable: p.adj.signif <chr>
D |> tukey_hsd(Treg ~ Group, detailed = T)# A tibble: 3 × 9
term group1 group2 null.value estimate conf.low conf.high p.adj
* <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Group PBS Isotope 0 -13.1 -39.3 13.1 0.449
2 Group PBS Treatment 0 -45.1 -71.3 -18.9 0.000416
3 Group Isotope Treatment 0 -31.9 -58.1 -5.75 0.0136
# ℹ 1 more variable: p.adj.signif <chr>
D |> tukey_hsd(Th ~ Group, detailed = T)# A tibble: 3 × 9
term group1 group2 null.value estimate conf.low conf.high p.adj
* <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Group PBS Isotope 0 -425. -536. -315. 2.5 e-11
2 Group PBS Treatment 0 -336. -446. -225. 1.24e- 8
3 Group Isotope Treatment 0 89.6 -20.9 200. 1.32e- 1
# ℹ 1 more variable: p.adj.signif <chr>
D |> tukey_hsd(NK ~ Group, detailed = T)# A tibble: 3 × 9
term group1 group2 null.value estimate conf.low conf.high p.adj
* <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Group PBS Isotope 0 60.5 -0.938 122. 0.0544
2 Group PBS Treatment 0 -23.1 -84.5 38.3 0.634
3 Group Isotope Treatment 0 -83.6 -145. -22.2 0.00538
# ℹ 1 more variable: p.adj.signif <chr>
D |> tukey_hsd(CD45.1 ~ Group, detailed = T)# A tibble: 3 × 9
term group1 group2 null.value estimate conf.low conf.high p.adj
* <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Group PBS Isotope 0 -77.7 -291. 136. 6.53e- 1
2 Group PBS Treatment 0 -7445. -7659. -7232. 1.06e-12
3 Group Isotope Treatment 0 -7367. -7581. -7154. 1.06e-12
# ℹ 1 more variable: p.adj.signif <chr>
Publication Ready Table
tb <- D |> tbl_summary(
by = Group,
statistic = ~"{mean} ({sd})",
digits = ~2)|>
add_p(test = ~ "aov") To save the table as a Word document titled “summary_table1” in the “results” folder, you can use the following code.
tb1 <- as_flex_table(tb)
save_as_docx(tb1, path = here("results", "summary_table1.docx"))This code provides an illustrative example of comparing the distribution of CD4T across different groups in the D data set. The ggboxplot() function is used to create a boxplot, and pairwise comparisons between groups are performed using t-tests with the stat_compare_means() function. This approach is applied to CD4T, and similar analyses can be conducted for the remaining variables by modifying the y argument to represent the other variables of interest (e.g., CD8T, T_cells, etc.).
ggboxplot(D, x = "Group", y = "CD4T", fill = "Group") +
stat_compare_means(comparisons = list(c("PBS", "Isotope"), c("Isotope", "Treatment"), c("PBS", "Treatment")),
method = "t.test", label = "p.signif", size = 5) +
annotate("text", x = 1, y = min(D$CD4T) * 0.95,
label = "* p <= 0.05; ** p <= 0.01; *** p <= 0.001; **** p <= 0.0001",
size = 4, hjust = 0)+
theme_pubr()