Analysis of the ToothGrowth data in the R datasets
package.
knitr::opts_chunk$set(echo = TRUE)
library(ggplot2)
library(tidyr)
library(rstatix)
##
## Attachement du package : 'rstatix'
## L'objet suivant est masqué depuis 'package:stats':
##
## filter
library(ggpubr)
# Load data
data("ToothGrowth")
#Sample of data grouped by the variable dose
ToothGrowth %>% sample_n_by(dose, size = 3)
## # A tibble: 9 × 3
## len supp dose
## <dbl> <fct> <dbl>
## 1 16.5 OJ 0.5
## 2 10 OJ 0.5
## 3 10 VC 0.5
## 4 15.5 VC 1
## 5 14.5 VC 1
## 6 26.4 OJ 1
## 7 33.9 VC 2
## 8 26.7 VC 2
## 9 29.5 VC 2
# Quick summary of the data
summary(ToothGrowth)
## len supp dose
## Min. : 4.20 OJ:30 Min. :0.500
## 1st Qu.:13.07 VC:30 1st Qu.:0.500
## Median :19.25 Median :1.000
## Mean :18.81 Mean :1.167
## 3rd Qu.:25.27 3rd Qu.:2.000
## Max. :33.90 Max. :2.000
# Exploration of ToothGrowth$dose and ToothGrowth$len, since we know exactly what ToothGrowth$supp contains
unique(ToothGrowth$dose)
## [1] 0.5 1.0 2.0
unique(ToothGrowth$len)
## [1] 4.2 11.5 7.3 5.8 6.4 10.0 11.2 5.2 7.0 16.5 15.2 17.3 22.5 13.6 14.5
## [16] 18.8 15.5 23.6 18.5 33.9 25.5 26.4 32.5 26.7 21.5 23.3 29.5 17.6 9.7 8.2
## [31] 9.4 19.7 20.0 25.2 25.8 21.2 27.3 22.4 24.5 24.8 30.9 29.4 23.0
my_plot_fun <- function(my_data,my_x) {
my_data <- as.data.frame(my_data)
my_data %>%
ggplot(aes(x=my_x, y=len)) +
theme(plot.title = element_text(color="#ADA717", size=18, face="bold.italic",hjust=0.5),
axis.title.x = element_text(color="#993333", size=18, face="bold"),
axis.text.x = element_text(color="#993333", size=14,vjust = 0),
axis.title.y = element_text(color="darkgreen", size=18, face="bold"),
axis.text.y = element_text(face="bold", color="darkgreen", size=14),
legend.text = element_text(size=12),
legend.title = element_text(size=16))}
dose and len,
grouped by supp.
# Conversion of ToothGrowth$dose into a factor
ToothGrowth$dose<-as.factor(ToothGrowth$dose)
mpt <-my_plot_fun(ToothGrowth,my_x = ToothGrowth$dose) +
geom_boxplot(aes(color = supp,group=dose), width = 0.6) +
geom_dotplot(aes(fill = as.factor(dose), color = supp,group=dose), binaxis='y', stackdir='center',
dotsize = 0.8,position = position_dodge(0.8),binwidth=1)+
scale_fill_manual( values = c("#00AFBB", "#E3B166","#A1A861"))+
scale_color_manual(values = c("red", "black")) + facet_grid(~ supp)+
labs(x = "Doses", y ="Toothe Length", fill = "Doses", color="Supplement delivery",
title = "Tooth length \n by dose amount",
caption = "Plot tooth length ('len') by the dose amount ('dose'), \n grouped by supplement delivery method ('supp')") +
theme(plot.caption = element_text(color = "darkblue", face = "italic", size = 12))
mpt
supp and len,
grouped by dose.
# Conversion of ToothGrowth$dose into a factor
ToothGrowth$dose<-as.factor(ToothGrowth$dose)
mpt2 <-my_plot_fun(ToothGrowth,my_x = ToothGrowth$supp) +
geom_boxplot(aes(color = dose,group=supp), width = 0.6) +
geom_dotplot(aes(fill = as.factor(supp), color = dose,group=supp), binaxis='y', stackdir='center',
dotsize = 0.8,position = position_dodge(0.8),binwidth=1)+
scale_fill_manual( values = c("#00AFBB", "#E3B166"))+
scale_color_manual(values = c("red", "black","darkgreen")) + facet_grid(~ dose)+
labs(x = "Supplement delivery method", y ="Toothe Length", fill = "Supplement delivery", color="Doses",
title = "Tooth length \n by supplement delivery method",
caption = "Plot tooth length ('len') by the supplement delivery method ('supp'), \n grouped by dose amount ('dose')") +
theme(plot.caption = element_text(color = "darkblue", face = "italic", size = 12))
mpt2
supp, two samples :
OJ ;VC.
ToothGrowth %>% group_by(supp) %>%
get_summary_stats(len,type = "mean_sd")
## # A tibble: 2 × 5
## supp variable n mean sd
## <fct> <chr> <dbl> <dbl> <dbl>
## 1 OJ len 30 20.7 6.61
## 2 VC len 30 17.0 8.27
ToothGrowth %>% levene_test(len ~ supp)
## # A tibble: 1 × 4
## df1 df2 statistic p
## <int> <int> <dbl> <dbl>
## 1 1 58 1.21 0.275
ToothGrowth %>% group_by(supp) %>%
identify_outliers(len)
## [1] supp len dose is.outlier is.extreme
## <0 lignes> (ou 'row.names' de longueur nulle)
# Test d de SHAPIRO-WILK par groupes
ToothGrowth %>% group_by(supp) %>%
shapiro_test(len)
## # A tibble: 2 × 4
## supp variable statistic p
## <fct> <chr> <dbl> <dbl>
## 1 OJ len 0.918 0.0236
## 2 VC len 0.966 0.428
# QQ-plot by group
ToothGrowth %>% ggqqplot(x = "len",facet.by = "supp")
VC has better normality
than OJ.
my_stats <- ToothGrowth %>% group_by(dose) %>%
t_test(formula = len ~ supp,var.equal = TRUE, detailed = TRUE, paired = FALSE)
my_stats
## # A tibble: 3 × 16
## dose estimate estimate1 estimate2 .y. group1 group2 n1 n2 statistic
## * <fct> <dbl> <dbl> <dbl> <chr> <chr> <chr> <int> <int> <dbl>
## 1 0.5 5.25 13.2 7.98 len OJ VC 10 10 3.17
## 2 1 5.93 22.7 16.8 len OJ VC 10 10 4.03
## 3 2 -0.0800 26.1 26.1 len OJ VC 10 10 -0.0461
## # … with 6 more variables: p <dbl>, df <dbl>, conf.low <dbl>, conf.high <dbl>,
## # method <chr>, alternative <chr>
dose .5 : T-value of 3.1697, we get a p-value of 0.0053
;dose 1.0 : T-value of 4.0328, we get a p-value of
7.81^{-4};dose 2.0 : T-value of -0.0461, we get a p-value of
0.964 ;Therefor, the p-value is strongly significant (\(p~<~.01\)) for dose .5 and
dose 1.0. And is not significant for the dose
2.0 (\(p~>~.10\)).
subset(ToothGrowth, ToothGrowth$dose %in% c(.5)) %>% cohens_d(formula = len ~ supp, var.equal = TRUE, paired = FALSE)
## # A tibble: 1 × 7
## .y. group1 group2 effsize n1 n2 magnitude
## * <chr> <chr> <chr> <dbl> <int> <int> <ord>
## 1 len OJ VC 1.42 10 10 large
subset(ToothGrowth, ToothGrowth$dose %in% c(1)) %>% cohens_d(formula = len ~ supp, var.equal = TRUE, paired = FALSE)
## # A tibble: 1 × 7
## .y. group1 group2 effsize n1 n2 magnitude
## * <chr> <chr> <chr> <dbl> <int> <int> <ord>
## 1 len OJ VC 1.80 10 10 large
subset(ToothGrowth, ToothGrowth$dose %in% c(2)) %>% cohens_d(formula = len ~ supp, var.equal = TRUE, paired = FALSE)
## # A tibble: 1 × 7
## .y. group1 group2 effsize n1 n2 magnitude
## * <chr> <chr> <chr> <dbl> <int> <int> <ord>
## 1 len OJ VC -0.0206 10 10 negligible
dose .5, which is saying there is a
strong effect ;dose 1.0, which is saying there is a
strong effect ;dose 2.0, which is saying there is
a negligible effect.The mean of the group OJ and is 20.663 with a standard
deviation of 6.606. Whereas, the mean of the group VC is
16.963 with a standard deviation of 8.266. The STUDENT’s T-test show
there is :
dose .5 : strong evidence against \(H_{0}\) given that \(p\) value is 0.0053 (less than \(.01\)) with \(t\) value 3.17 and a degree of freedom
\(df~=~\) 18.
dose 1.0 : strong evidence against \(H_{0}\) given that \(p\) value is 8^{-4} (less than \(.01\)) with \(t\) value 4.033 and a degree of freedom
\(df~=~\) 18.
dose 2.0 : no evidence against \(H_{0}\) given that \(p\) value is 0.964 (more than \(.10\)) with \(t\) value -0.046 and a degree of freedom
\(df~=~\) 18.
dose 2.0 and false for dose .5 and
1.0. In addition to what was said above, we can notice that
: - for dose .5 : the value 0 is excluded of the confidence
interval of 95% which is [1.7702617,8.7297383] ;
dose .5 : the value 0 is excluded of the confidence
interval of 95% which is [2.8406919,9.0193081] ;
dose .5 : the value 0 is included in the confidence
interval of 95% which is [-3.7229985,3.5629985].
dose 2.0 and \(Type
~I\) error for dose .5 or 1.0.
my_stats <- my_stats %>% add_xy_position(x = "supp")
my_bxp <- ggboxplot(ToothGrowth, x = "supp", y = "len", add = "jitter", facet.by = "dose",
fill = "supp", palette = c("#00AFBB", "#E7B800")) +
theme(legend.position="bottom") +
stat_pvalue_manual(my_stats, tip.length = 0, label = "T-test p = {p}") +
scale_y_continuous(expand = expansion(mult = c(0.1, 0.15))) +
labs(x = "Supplement delivery", y ="Tooth length", fill = "Supplement delivery",
title = "Boxplot of the tooth length by group (supp) \n",
caption = "The plot resume some important statistics values and show \n
the distribution of the observations") +
theme(plot.caption = element_text(color = "darkblue", face = "italic", size = 12))
my_bxp
There is three groups in dose, three samples :
.5 ;1 ;2.ToothGrowth %>% group_by(dose) %>%
get_summary_stats(len,type = "mean_sd")
## # A tibble: 3 × 5
## dose variable n mean sd
## <fct> <chr> <dbl> <dbl> <dbl>
## 1 0.5 len 20 10.6 4.5
## 2 1 len 20 19.7 4.42
## 3 2 len 20 26.1 3.77
ToothGrowth_sub1 <- subset(ToothGrowth, ToothGrowth$dose %in% c(.5,1))
ToothGrowth_sub2 <- subset(ToothGrowth, ToothGrowth$dose %in% c(1.0,2.0))
ToothGrowth_sub2$dose<-as.factor(ToothGrowth_sub2$dose)
ToothGrowth_sub3 <- subset(ToothGrowth, ToothGrowth$dose %in% c(.5,2.0))
ToothGrowth_sub3$dose<-as.factor(ToothGrowth_sub3$dose)
# dose .5 and 1.0
ToothGrowth_sub1 %>% levene_test(len ~ dose)
## # A tibble: 1 × 4
## df1 df2 statistic p
## <int> <int> <dbl> <dbl>
## 1 1 38 0.287 0.595
# dose 1.0 and 2.0
ToothGrowth_sub2 %>% levene_test(len ~ dose)
## # A tibble: 1 × 4
## df1 df2 statistic p
## <int> <int> <dbl> <dbl>
## 1 1 38 1.61 0.212
# dose .5 and 2.0
ToothGrowth_sub3 %>% levene_test(len ~ dose)
## # A tibble: 1 × 4
## df1 df2 statistic p
## <int> <int> <dbl> <dbl>
## 1 1 38 0.300 0.587
The three p-value of the LEVENE test are not significant, which suggests
that there is not a significant difference between the variances of the
two groups. Indeed the p-values of LEVENE are 0.595, 0.212 and 0.587.
Therefore, we will use STUDENT’s T-test, which assumes the equality of
the two variances.
ToothGrowth %>% group_by(dose) %>%
identify_outliers(len)
## # A tibble: 1 × 5
## dose len supp is.outlier is.extreme
## <fct> <dbl> <fct> <lgl> <lgl>
## 1 0.5 21.5 OJ TRUE FALSE
The conclusion of this test is there is only one extreme value, with the
group dose = .5.
# Test d de SHAPIRO-WILK par groupes
ToothGrowth %>% group_by(dose) %>%
shapiro_test(len)
## # A tibble: 3 × 4
## dose variable statistic p
## <fct> <chr> <dbl> <dbl>
## 1 0.5 len 0.941 0.247
## 2 1 len 0.931 0.164
## 3 2 len 0.978 0.902
# QQ-plot by group
ToothGrowth %>% ggqqplot(x = "len",facet.by = "dose")
VC has better normality
than OJ.
# dose .5 and 1.0
ToothGrowth_sub1 %>% t_test(formula = len ~ dose,var.equal = TRUE)
## # A tibble: 1 × 8
## .y. group1 group2 n1 n2 statistic df p
## * <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl>
## 1 len 0.5 1 20 20 -6.48 38 0.000000127
# dose 1.0 and 2.0
ToothGrowth_sub2$dose<-as.numeric(ToothGrowth_sub2$dose)
ToothGrowth_sub2 %>% t_test(formula = len ~ dose,var.equal = TRUE)
## # A tibble: 1 × 8
## .y. group1 group2 n1 n2 statistic df p
## * <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl>
## 1 len 2 3 20 20 -4.90 38 0.0000181
# dose .5 and 2.0
ToothGrowth_sub3$dose<-as.numeric(ToothGrowth_sub3$dose)
ToothGrowth_sub3 %>% t_test(formula = len ~ dose,var.equal = TRUE)
## # A tibble: 1 × 8
## .y. group1 group2 n1 n2 statistic df p
## * <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl>
## 1 len 1 3 20 20 -11.8 38 2.84e-14
dose .5 and 1.0 : with a T-value -6.4766, we get
a p-value of 1.27^{-7}. Therefor, the p-value is significant (\(p~<~.01\)) ;dose 1.0 and 2.0 : with a T-value -4.9005, we get
a p-value of 1.81^{-5}. Therefor, the p-value is significant (\(p~<~.01\)) ;dose .5 and 2.0 : with a T-value -11.799, we get
a p-value of 2.84^{-14}. Therefor, the p-value is significant (\(p~<~.01\)).ToothGrowth %>% cohens_d(formula = len ~ dose, var.equal = TRUE)
## # A tibble: 3 × 7
## .y. group1 group2 effsize n1 n2 magnitude
## * <chr> <chr> <chr> <dbl> <int> <int> <ord>
## 1 len 0.5 1 -2.05 20 20 large
## 2 len 0.5 2 -3.73 20 20 large
## 3 len 1 2 -1.55 20 20 large
The mean of the group dose = .5 is 10.605 with a
standard deviation of 4.5. Whereas, the mean of the group dose =
1.0 is 19.735 with a standard deviation of 4.415. Finally, the
mean of the group dose = 2.0 is 26.1 with a standard
deviation of 3.774.
The STUDENT’s T-test shows :
dose .5 and 1.0 : there is strong evidence against
\(H_{0}\) given that the \(p\) 1.27^{-7} value is with \(t\) value -6.477 and a degree of freedom
\(df~=~\) 38.
dose 1.0 and 2.0 : there is strong evidence against
\(H_{0}\) given that the \(p\) 1.81^{-5} value is with \(t\) value -4.9 and a degree of freedom
\(df~=~\) 38.
dose 1.0 and 2.0 : there is strong evidence against
\(H_{0}\) given that the \(p\) 2.84^{-14} value is with \(t\) value -11.799 and a degree of freedom
\(df~=~\) 38.
all_stats <- ToothGrowth %>% t_test(formula = len ~ dose,var.equal = TRUE, detailed = TRUE,paired = FALSE)
all_stats <- all_stats %>% add_xy_position(x = "dose")
all_stats <- all_stats %>% mutate(y.position = c(29, 35, 39))
my_bxp2 <- ggboxplot(ToothGrowth, x = "dose", y = "len", add = "jitter",
fill = "dose", palette = c("#00AFBB", "#E7B800","#CF2406")) +
theme(legend.position="bottom") +
stat_pvalue_manual(all_stats, tip.length = 0, label = "T-test, p = {p}") +
labs(x = "Dose", y ="Tooth length", fill = "Dose",
title = "Boxplot of the tooth length by group (dose) \n",
caption = "The plot resume some important statistics values and show \n
the distribution of the observations") +
theme(plot.caption = element_text(color = "darkblue", face = "italic", size = 12))
my_bxp2