Case Study 2

About the Data

The data set includes counts of T cells, CD4+ T cells, CD8+ T cells, regulatory T cells (Tregs), T helper cells (Th), B cells, natural killer (NK) cells, and CD45.1+ cells, which are key indicators of immune function from three experimental groups: PBS Control Group, Isotope Control Group and Treatment Group. This data set is simulated using the example discussed the course by Peter Mac Data Science Training. To download the data set, click here

The preview of the data:

Loading R Packages

To use the following R packages in your code, you’ll need to install them first. Installation requires an active internet connection. Once installed, you do not need an internet connection to load the packages in future sessions.

# data
library(here)      # The location management : import/export
library(tidyverse) # data management

#graphs
library(ggplot2)   # graphs
library(RColorBrewer) # color palette
library(ggpubr) # combine graphs

# analysis
library(moments)   # for skewnness and kurtosis
library(rstatix)   # statistical tests

# publication ready table
library(gtsummary) # table generation
library(flextable) # save the table

Data Import

To import the data file named ‘case_study2.csv’, which is stored in the ‘data’ folder of your project directory, use the following R code.

D <- read.csv(here("data", "case_study2.csv"))
glimpse(D)
Rows: 45
Columns: 8
$ Group   <chr> "PBS", "PBS", "PBS", "PBS", "PBS", "PBS", "PBS", "PBS", "PBS",…
$ CD4T    <int> 4882, 5004, 4936, 4742, 4957, 4924, 4908, 5011, 4849, 4926, 47…
$ CD8T    <int> 1003, 1005, 1078, 1008, 1029, 965, 1071, 1007, 965, 965, 1045,…
$ T_cells <int> 5885, 6010, 6014, 5750, 5986, 5889, 5979, 6018, 5814, 5891, 58…
$ Treg    <int> 338, 351, 359, 327, 365, 347, 344, 362, 315, 372, 358, 331, 27…
$ Th      <int> 2773, 3125, 3160, 3066, 3130, 3015, 2992, 3051, 3221, 3244, 30…
$ NK      <int> 309, 330, 354, 453, 268, 335, 366, 289, 444, 337, 223, 440, 32…
$ CD45.1  <dbl> 8278.534, 8151.538, 7882.493, 7957.416, 8012.542, 7858.919, 79…
sum(is.na(D)) # number of missing values
[1] 0
summary(D)
    Group                CD4T           CD8T         T_cells    
 Length:45          Min.   :4742   Min.   : 965   Min.   :5750  
 Class :character   1st Qu.:4920   1st Qu.:1004   1st Qu.:5929  
 Mode  :character   Median :5018   Median :1029   Median :6026  
                    Mean   :5273   Mean   :1040   Mean   :6313  
                    3rd Qu.:5827   3rd Qu.:1071   3rd Qu.:6886  
                    Max.   :6106   Max.   :1186   Max.   :7127  
      Treg             Th             NK            CD45.1      
 Min.   :246.0   Min.   :2423   Min.   :196.0   Min.   : 313.3  
 1st Qu.:301.0   1st Qu.:2687   1st Qu.:309.0   1st Qu.: 670.1  
 Median :328.0   Median :2773   Median :375.0   Median :7820.1  
 Mean   :324.3   Mean   :2826   Mean   :360.8   Mean   :5490.9  
 3rd Qu.:347.0   3rd Qu.:3015   3rd Qu.:413.0   3rd Qu.:8012.5  
 Max.   :391.0   Max.   :3244   Max.   :512.0   Max.   :8668.7  

By default, the categories of the Group variable in data set D are arranged in alphabetical order, which may not reflect the desired logical order for analysis or visualization.

table(D$Group)

  Isotope       PBS Treatment 
       15        15        15 

The following R code ensures that the Group variable is treated as a factor with a specified order of levels: "PBS", "Isotope", and "Treatment".

D <- D |> mutate(Group = factor(Group, levels = c("PBS", "Isotope", "Treatment")))
table(D$Group)

      PBS   Isotope Treatment 
       15        15        15 

Descriptive Statistics

This R code calculates descriptive statistics immune cell type variables in the data set D, grouped by the Group variable.

tb_CD4T <-  D |> group_by(Group) |> 
  summarise(mean = mean(CD4T),
            sd = sd(CD4T),
            median = median(CD4T),
            skew = skewness(CD4T),
            kurt = kurtosis(CD4T))

tb_CD8T <-  D |> group_by(Group) |> 
  summarise(mean = mean(CD8T),
            sd = sd(CD8T),
            median = median(CD8T),
            skew = skewness(CD8T),
            kurt = kurtosis(CD8T))

tb_T_cells <-  D |> group_by(Group) |> 
  summarise(mean = mean(T_cells),
            sd = sd(T_cells),
            median = median(T_cells),
            skew = skewness(T_cells),
            kurt = kurtosis(T_cells))

tb_Treg <-  D |> group_by(Group) |> 
  summarise(mean = mean(Treg),
            sd = sd(Treg),
            median = median(Treg),
            skew = skewness(Treg),
            kurt = kurtosis(Treg))

tb_Th <-  D |> group_by(Group) |> 
  summarise(mean = mean(Th),
            sd = sd(Th),
            median = median(Th),
            skew = skewness(Th),
            kurt = kurtosis(Th))

tb_NK <-  D |> group_by(Group) |> 
  summarise(mean = mean(NK),
            sd = sd(NK),
            median = median(NK),
            skew = skewness(NK),
            kurt = kurtosis(NK))

tb_CD45.1 <-  D |> group_by(Group) |> 
  summarise(mean = mean(CD45.1),
            sd = sd(CD45.1),
            median = median(CD45.1),
            skew = skewness(CD45.1),
            kurt = kurtosis(CD45.1))

The output of the above code chunk is a tibble containing the descriptive statistics for each variable, including the mean, standard deviation, median, skewness, kurtosis.

print(tb_CD4T)
# A tibble: 3 × 6
  Group      mean    sd median    skew  kurt
  <fct>     <dbl> <dbl>  <int>   <dbl> <dbl>
1 PBS       4903.  92.3   4908 -0.0528  2.46
2 Isotope   5010. 105.    4990  0.700   2.72
3 Treatment 5907. 103.    5903  0.381   2.29
print(tb_CD8T)
# A tibble: 3 × 6
  Group      mean    sd median   skew  kurt
  <fct>     <dbl> <dbl>  <int>  <dbl> <dbl>
1 PBS       1024.  43.0   1008 0.0302  1.71
2 Isotope   1047.  66.8   1024 0.784   2.37
3 Treatment 1049.  34.3   1044 0.0558  1.81
print(tb_T_cells)
# A tibble: 3 × 6
  Group      mean    sd median   skew  kurt
  <fct>     <dbl> <dbl>  <int>  <dbl> <dbl>
1 PBS       5927. 104.    5906  0.468  3.07
2 Isotope   6057  132.    6020  0.697  2.20
3 Treatment 6957.  99.9   6953 -0.175  2.89
print(tb_Treg)
# A tibble: 3 × 6
  Group      mean    sd median    skew  kurt
  <fct>     <dbl> <dbl>  <int>   <dbl> <dbl>
1 PBS        344.  26.3    347 -1.20    4.52
2 Isotope    331.  27.9    328  0.903   3.03
3 Treatment  299.  33.8    299 -0.0480  1.84
print(tb_Th)
# A tibble: 3 × 6
  Group      mean    sd median   skew  kurt
  <fct>     <dbl> <dbl>  <int>  <dbl> <dbl>
1 PBS       3080.  117.   3109 -1.05   4.22
2 Isotope   2654.  118.   2695 -0.391  2.24
3 Treatment 2744.  137.   2754 -0.326  2.31
print(tb_NK)
# A tibble: 3 × 6
  Group      mean    sd median   skew  kurt
  <fct>     <dbl> <dbl>  <int>  <dbl> <dbl>
1 PBS        348.  66.9    337 0.0890  2.31
2 Isotope    409.  32.1    407 0.748   4.57
3 Treatment  325.  94.2    308 0.413   2.26
print(tb_CD45.1)
# A tibble: 3 × 6
  Group      mean    sd median     skew  kurt
  <fct>     <dbl> <dbl>  <dbl>    <dbl> <dbl>
1 PBS       7999.  367.  7990. -0.117    2.95
2 Isotope   7921.  136.  7947. -0.457    2.21
3 Treatment  553.  144.   537. -0.00657  1.78

Assessment of Normality of Assumption

This code assesses the normality assumption for various variables in the D dataset, grouped by the Group variable, using the Shapiro-Wilk test. The Shapiro-Wilk test is a statistical test to check whether a variable is normally distributed. This is done using the {rstatix} package in R, which provides a convenient function shapiro_test() for performing the test.

D |> group_by(Group) |> shapiro_test(CD4T)
# A tibble: 3 × 4
  Group     variable statistic     p
  <fct>     <chr>        <dbl> <dbl>
1 PBS       CD4T         0.976 0.934
2 Isotope   CD4T         0.937 0.347
3 Treatment CD4T         0.967 0.818
D |> group_by(Group) |> shapiro_test(CD8T)
# A tibble: 3 × 4
  Group     variable statistic      p
  <fct>     <chr>        <dbl>  <dbl>
1 PBS       CD8T         0.911 0.142 
2 Isotope   CD8T         0.875 0.0400
3 Treatment CD8T         0.946 0.471 
D |> group_by(Group) |> shapiro_test(T_cells)
# A tibble: 3 × 4
  Group     variable statistic      p
  <fct>     <chr>        <dbl>  <dbl>
1 PBS       T_cells      0.963 0.738 
2 Isotope   T_cells      0.886 0.0582
3 Treatment T_cells      0.970 0.859 
D |> group_by(Group) |> shapiro_test(Treg)
# A tibble: 3 × 4
  Group     variable statistic     p
  <fct>     <chr>        <dbl> <dbl>
1 PBS       Treg         0.911 0.141
2 Isotope   Treg         0.905 0.115
3 Treatment Treg         0.952 0.561
D |> group_by(Group) |> shapiro_test(Th)
# A tibble: 3 × 4
  Group     variable statistic     p
  <fct>     <chr>        <dbl> <dbl>
1 PBS       Th           0.924 0.223
2 Isotope   Th           0.957 0.647
3 Treatment Th           0.955 0.609
D |> group_by(Group) |> shapiro_test(NK)
# A tibble: 3 × 4
  Group     variable statistic     p
  <fct>     <chr>        <dbl> <dbl>
1 PBS       NK           0.946 0.465
2 Isotope   NK           0.933 0.304
3 Treatment NK           0.961 0.706
D |> group_by(Group) |> shapiro_test(CD45.1)
# A tibble: 3 × 4
  Group     variable statistic     p
  <fct>     <chr>        <dbl> <dbl>
1 PBS       CD45.1       0.978 0.957
2 Isotope   CD45.1       0.952 0.560
3 Treatment CD45.1       0.951 0.548

Assessment of Normality of Assumption

The following code assesses the homogeneity of variances (homoscedasticity) across different groups for several variables in the D dataset, using Levene’s Test from the {rstatix} package. Levene’s Test is used to evaluate whether the variances are equal across the groups, which is an important assumption for one way ANOVA.

D |> levene_test(CD4T ~ Group)
# A tibble: 1 × 4
    df1   df2 statistic     p
  <int> <int>     <dbl> <dbl>
1     2    42     0.202 0.818
D |> levene_test(CD8T ~ Group)
# A tibble: 1 × 4
    df1   df2 statistic     p
  <int> <int>     <dbl> <dbl>
1     2    42      1.16 0.323
D |> levene_test(T_cells ~ Group)
# A tibble: 1 × 4
    df1   df2 statistic     p
  <int> <int>     <dbl> <dbl>
1     2    42     0.520 0.598
D |> levene_test(Treg ~ Group)
# A tibble: 1 × 4
    df1   df2 statistic     p
  <int> <int>     <dbl> <dbl>
1     2    42      1.02 0.370
D |> levene_test(Th ~ Group)
# A tibble: 1 × 4
    df1   df2 statistic     p
  <int> <int>     <dbl> <dbl>
1     2    42     0.297 0.745
D |> levene_test(NK ~ Group)
# A tibble: 1 × 4
    df1   df2 statistic       p
  <int> <int>     <dbl>   <dbl>
1     2    42      6.08 0.00480
D |> levene_test(CD45.1 ~ Group)
# A tibble: 1 × 4
    df1   df2 statistic       p
  <int> <int>     <dbl>   <dbl>
1     2    42      5.22 0.00942

Visualization Using Box plots

The basic boxplot code is:

ggplot(D, aes(x = Group, y = CD4T,col = Group)) +
  geom_boxplot() + 
  labs(title = "Boxplot of CD4+ T cells by Group",        
       x = "Group",                            
       y = "CD4+ T cells") +
  theme_minimal()

the color scheme using the {RColorBrewer} package, which provides a variety of color palettes. One such palette, “Set2” is used in the following code

ggplot(D, aes(x = Group, y = CD4T,fill = Group)) +
  geom_boxplot() + 
  scale_fill_brewer(palette = "Set2") +
  labs(title = "Boxplot of CD4+ T cells by Group",        
       x = "Group",                            
       y = "CD4+ T cells") +
  theme_minimal()

The list of available color palettes, along with their details, can be obtained using the following code:

brewer.pal.info
         maxcolors category colorblind
BrBG            11      div       TRUE
PiYG            11      div       TRUE
PRGn            11      div       TRUE
PuOr            11      div       TRUE
RdBu            11      div       TRUE
RdGy            11      div      FALSE
RdYlBu          11      div       TRUE
RdYlGn          11      div      FALSE
Spectral        11      div      FALSE
Accent           8     qual      FALSE
Dark2            8     qual       TRUE
Paired          12     qual       TRUE
Pastel1          9     qual      FALSE
Pastel2          8     qual      FALSE
Set1             9     qual      FALSE
Set2             8     qual       TRUE
Set3            12     qual      FALSE
Blues            9      seq       TRUE
BuGn             9      seq       TRUE
BuPu             9      seq       TRUE
GnBu             9      seq       TRUE
Greens           9      seq       TRUE
Greys            9      seq       TRUE
Oranges          9      seq       TRUE
OrRd             9      seq       TRUE
PuBu             9      seq       TRUE
PuBuGn           9      seq       TRUE
PuRd             9      seq       TRUE
Purples          9      seq       TRUE
RdPu             9      seq       TRUE
Reds             9      seq       TRUE
YlGn             9      seq       TRUE
YlGnBu           9      seq       TRUE
YlOrBr           9      seq       TRUE
YlOrRd           9      seq       TRUE

This boxplot includes black borders (color = "black") and a semi-transparent fill (alpha = 0.6) for better visual clarity.

ggplot(D, aes(x = Group, y = CD4T,fill = Group)) +
  geom_boxplot(color = "black", alpha = 0.6) + 
  scale_fill_brewer(palette = "Set2") +
  labs(title = "Boxplot of CD4+ T cells by Group",        
       x = "Group",                            
       y = "CD4+ T cells") +
  theme_minimal()

To add jittered data Points on top:

ggplot(D, aes(x = Group, y = CD4T, fill = Group)) +
  geom_boxplot(color = "black", alpha = 0.6) + 
  geom_jitter(width = 0.2, color = "black", alpha = 0.6) +  # Add jittered dots
  scale_fill_brewer(palette = "Set2") +
  theme_minimal()

One Way ANOVA

This code performs one-way ANOVA tests to assess whether there are significant differences in the means of various immune cell counts (CD4T, CD8T, T_cells, Treg, Th, NK, CD45.1) across different groups in the D data set. The anova_test() function from the {rstatix} package is used to conduct the ANOVA for each variable, with the detailed = T providing detailed output, including the F-statistic, p-value, and additional information

D |>  anova_test(CD4T ~ Group, detailed = T)
ANOVA Table (type II tests)

  Effect     SSn      SSd DFn DFd       F        p p<.05   ges
1  Group 9135584 421348.8   2  42 455.317 3.39e-29     * 0.956
D |>  anova_test(CD8T ~ Group, detailed = T)
ANOVA Table (type II tests)

  Effect      SSn      SSd DFn DFd     F     p p<.05   ges
1  Group 5754.178 104945.1   2  42 1.151 0.326       0.052
D |>  anova_test(T_cells ~ Group, detailed = T)
ANOVA Table (type II tests)

  Effect     SSn      SSd DFn DFd       F        p p<.05   ges
1  Group 9430769 536130.7   2  42 369.399 2.21e-27     * 0.946
D |>  anova_test(Treg ~ Group, detailed = T)
ANOVA Table (type II tests)

  Effect      SSn      SSd DFn DFd     F       p p<.05   ges
1  Group 16116.13 36599.87   2  42 9.247 0.00047     * 0.306
D |>  anova_test(Th ~ Group, detailed = T)
ANOVA Table (type II tests)

  Effect     SSn      SSd DFn DFd      F        p p<.05   ges
1  Group 1508775 651836.4   2  42 48.608 1.18e-11     * 0.698
D |>  anova_test(NK ~ Group, detailed = T)
ANOVA Table (type II tests)

  Effect      SSn      SSd DFn DFd     F     p p<.05   ges
1  Group 55901.64 201224.1   2  42 5.834 0.006     * 0.217
D |>  anova_test(CD45.1 ~ Group, detailed = T)
ANOVA Table (type II tests)

  Effect       SSn     SSd DFn DFd        F       p p<.05   ges
1  Group 548583411 2432905   2  42 4735.184 3.5e-50     * 0.996

For CD8T, the p-value is 0.326, which is greater than the significance level of 0.05. This indicates that there is no significant difference in the means of CD8T across the groups. As a result, no post-hoc tests are conducted for CD8T.

For the other variables (CD4T, T_cells, Treg, Th, NK, CD45.1), the p-values are less than 0.05, suggesting significant differences between the group means, and post-hoc tests are performed to identify which groups are different.

D |> tukey_hsd(CD4T ~ Group)
# A tibble: 3 × 9
  term  group1  group2    null.value estimate conf.low conf.high    p.adj
* <chr> <chr>   <chr>          <dbl>    <dbl>    <dbl>     <dbl>    <dbl>
1 Group PBS     Isotope            0     107.     18.1      196. 1.5 e- 2
2 Group PBS     Treatment          0    1005.    916.      1094. 1.06e-12
3 Group Isotope Treatment          0     898.    809.       987. 1.06e-12
# ℹ 1 more variable: p.adj.signif <chr>
D |> tukey_hsd(T_cells ~ Group, detailed = T)
# A tibble: 3 × 9
  term  group1  group2    null.value estimate conf.low conf.high    p.adj
* <chr> <chr>   <chr>          <dbl>    <dbl>    <dbl>     <dbl>    <dbl>
1 Group PBS     Isotope            0     130.     29.8      230. 8.22e- 3
2 Group PBS     Treatment          0    1030.    929.      1130. 1.06e-12
3 Group Isotope Treatment          0     900.    799.      1000. 1.06e-12
# ℹ 1 more variable: p.adj.signif <chr>
D |> tukey_hsd(Treg ~ Group, detailed = T)
# A tibble: 3 × 9
  term  group1  group2    null.value estimate conf.low conf.high    p.adj
* <chr> <chr>   <chr>          <dbl>    <dbl>    <dbl>     <dbl>    <dbl>
1 Group PBS     Isotope            0    -13.1    -39.3     13.1  0.449   
2 Group PBS     Treatment          0    -45.1    -71.3    -18.9  0.000416
3 Group Isotope Treatment          0    -31.9    -58.1     -5.75 0.0136  
# ℹ 1 more variable: p.adj.signif <chr>
D |> tukey_hsd(Th ~ Group, detailed = T)
# A tibble: 3 × 9
  term  group1  group2    null.value estimate conf.low conf.high    p.adj
* <chr> <chr>   <chr>          <dbl>    <dbl>    <dbl>     <dbl>    <dbl>
1 Group PBS     Isotope            0   -425.    -536.      -315. 2.5 e-11
2 Group PBS     Treatment          0   -336.    -446.      -225. 1.24e- 8
3 Group Isotope Treatment          0     89.6    -20.9      200. 1.32e- 1
# ℹ 1 more variable: p.adj.signif <chr>
D |> tukey_hsd(NK ~ Group, detailed = T)
# A tibble: 3 × 9
  term  group1  group2    null.value estimate conf.low conf.high   p.adj
* <chr> <chr>   <chr>          <dbl>    <dbl>    <dbl>     <dbl>   <dbl>
1 Group PBS     Isotope            0     60.5   -0.938     122.  0.0544 
2 Group PBS     Treatment          0    -23.1  -84.5        38.3 0.634  
3 Group Isotope Treatment          0    -83.6 -145.        -22.2 0.00538
# ℹ 1 more variable: p.adj.signif <chr>
D |> tukey_hsd(CD45.1 ~ Group, detailed = T)
# A tibble: 3 × 9
  term  group1  group2    null.value estimate conf.low conf.high    p.adj
* <chr> <chr>   <chr>          <dbl>    <dbl>    <dbl>     <dbl>    <dbl>
1 Group PBS     Isotope            0    -77.7    -291.      136. 6.53e- 1
2 Group PBS     Treatment          0  -7445.    -7659.    -7232. 1.06e-12
3 Group Isotope Treatment          0  -7367.    -7581.    -7154. 1.06e-12
# ℹ 1 more variable: p.adj.signif <chr>

Publication Ready Table

tb <- D  |>  tbl_summary(
  by = Group, 
  statistic = ~"{mean} ({sd})", 
  digits = ~2)|> 
  add_p(test = ~ "aov") 

To save the table as a Word document titled “summary_table1” in the “results” folder, you can use the following code.

tb1 <- as_flex_table(tb)
save_as_docx(tb1, path = here("results", "summary_table1.docx"))

This code provides an illustrative example of comparing the distribution of CD4T across different groups in the D data set. The ggboxplot() function is used to create a boxplot, and pairwise comparisons between groups are performed using t-tests with the stat_compare_means() function. This approach is applied to CD4T, and similar analyses can be conducted for the remaining variables by modifying the y argument to represent the other variables of interest (e.g., CD8T, T_cells, etc.).

ggboxplot(D, x = "Group", y = "CD4T", fill = "Group") +
  stat_compare_means(comparisons = list(c("PBS", "Isotope"), c("Isotope", "Treatment"), c("PBS", "Treatment")),
                     method = "t.test", label = "p.signif", size = 5) + 
  annotate("text", x = 1, y = min(D$CD4T) * 0.95, 
           label = "* p <= 0.05; ** p <= 0.01; *** p <= 0.001; **** p <= 0.0001", 
           size = 4, hjust = 0)+
  theme_pubr()