load libraries
library(palmerpenguins)
library(ggpubr)
library(jmv)
library(rstatix)
library(tidyverse)
library(here)
library(janitor)
library(ggeasy)
library(readxl)
library(gt)
library(gtsummary)
theme_set(theme_bw())
Stats in R using penguins dataset
Example 1a: Are male penguins heavier than female penguins? (categorical and continuous - 2 groups: male and female)
descriptive statistics
penguins <- penguins %>%
na.omit()
body_mass_summary <- penguins %>%
group_by(sex) %>%
summarise(mean = mean(body_mass_g),
sd = sd(body_mass_g),
n = n(),
se = sd/sqrt(n))
body_mass_summary %>%
gt() %>%
fmt_number(
columns = 2:5,
decimals = 2
)
| sex |
mean |
sd |
n |
se |
| female |
3,862.27 |
666.17 |
165.00 |
51.86 |
| male |
4,545.68 |
787.63 |
168.00 |
60.77 |
visualization
# bar plot
body_mass_summary %>%
ggplot(aes(x = sex, y = mean, fill = sex)) +
geom_col() +
geom_errorbar(aes(ymin = mean - se, ymax = mean + se),
width=.2) +
easy_text_size(15)

# boxplot
penguins %>%
group_by(sex) %>%
ggplot(aes(x = sex, y = body_mass_g, fill = sex)) +
geom_boxplot(alpha = 0.5) +
geom_jitter(alpha = 0.8, aes(color=sex)) +
easy_text_size(15)

statistics: we want to compare means between 2 groups… t test
female <- penguins %>%
filter(sex == "female")
male <- penguins %>%
filter(sex == "male")
t.test(female$body_mass_g, male$body_mass_g)
##
## Welch Two Sample t-test
##
## data: female$body_mass_g and male$body_mass_g
## t = -8.5545, df = 323.9, p-value = 4.794e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -840.5783 -526.2453
## sample estimates:
## mean of x mean of y
## 3862.273 4545.685
# or we can use jmv package
ttestIS(formula = body_mass_g ~ sex, data = penguins) # independent samples t test. Dependent variable to the left of ~
##
## INDEPENDENT SAMPLES T-TEST
##
## Independent Samples T-Test
## -----------------------------------------------------------------------
## Statistic df p
## -----------------------------------------------------------------------
## body_mass_g Student's t -8.541720 <U+1D43> 331.0000 < .0000001
## -----------------------------------------------------------------------
## <U+1D43> Levene's test is significant (p < .05), suggesting a violation
## of the assumption of equal variances
- mean of female=3862 and mean of male=4546
- t test revealed this is a significant difference. state t value, df, p value
ggpubr: “publication ready”
plot_1a_pub <- ggboxplot(penguins, x = "sex", y = "body_mass_g",
color = "sex", palette =c("darkblue", "darkred"),
add = "jitter")
my_comparisons <- list(c("female", "male")) # has to be in a list
plot_1a_pub +
stat_compare_means(comparisons = my_comparisons, method="t.test")

# Add t.test comparisons p-value (if more than 2 groups, you can "anova")
Example 1b: Do different species have the same or different overall weight? (categorical and continuous - 3 groups)
https://www.datanovia.com/en/lessons/anova-in-r/
descriptive statistics
body_mass_summary <- penguins %>%
group_by(species) %>%
summarise(mean = mean(body_mass_g),
sd = sd(body_mass_g),
n = n(),
se = sd/sqrt(n))
body_mass_summary %>%
gt()
| species |
mean |
sd |
n |
se |
| Adelie |
3706.164 |
458.6201 |
146 |
37.95567 |
| Chinstrap |
3733.088 |
384.3351 |
68 |
46.60747 |
| Gentoo |
5092.437 |
501.4762 |
119 |
45.97024 |
penguins %>%
group_by(species) %>%
get_summary_stats(body_mass_g, type = "full") # rstatix
## # A tibble: 3 x 14
## species variable n min max median q1 q3 iqr mad mean sd
## <fct> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Adelie body_ma~ 146 2850 4775 3700 3362. 4000 638. 445. 3706. 459.
## 2 Chinstr~ body_ma~ 68 2700 4800 3700 3488. 3950 462. 371. 3733. 384.
## 3 Gentoo body_ma~ 119 3950 6300 5050 4700 5500 800 593. 5092. 501.
## # ... with 2 more variables: se <dbl>, ci <dbl>
penguins %>%
group_by(species) %>%
identify_outliers(body_mass_g)
## # A tibble: 2 x 10
## species island bill_length_mm bill_depth_mm flipper_length_~ body_mass_g sex
## <fct> <fct> <dbl> <dbl> <int> <int> <fct>
## 1 Chinst~ Dream 52 20.7 210 4800 male
## 2 Chinst~ Dream 46.9 16.6 192 2700 fema~
## # ... with 3 more variables: year <int>, is.outlier <lgl>, is.extreme <lgl>
visualization
# identical code as in example 1a but now species rather than sex
plot_1b <- penguins %>%
na.omit() %>%
group_by(species) %>%
ggplot(aes(x = species, y = body_mass_g, fill = species)) +
geom_boxplot(alpha = 0.5) +
geom_jitter(alpha = 0.8, aes(color=species)) +
easy_text_size(15)
statistics: we want to compare means between 3 groups… anova
# we can still do multiple t tests (but should really use a correction if we do this)
my_comparisons <- list(c("Adelie", "Chinstrap"), c("Chinstrap", "Gentoo")) # has to be in a list
plot_1b +
stat_compare_means(comparisons = my_comparisons, method="t.test")

# it's better if we do an anova
plot_1b +
stat_compare_means(method="anova")

# Compute the analysis of variance
aov_1b <- aov(body_mass_g ~ species, data = penguins)
# Summary of the analysis
summary(aov_1b)
## Df Sum Sq Mean Sq F value Pr(>F)
## species 2 145190219 72595110 341.9 <2e-16 ***
## Residuals 330 70069447 212332
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# other ANOVA functions
# anovaOneW() # jmv
# anova_test() # rstatix
# welch_anova_test() # rstatix
- Report ANOVA results We observed differences in body mass between the three species of Adelie (M=3706), Chinstrap (M=3733), and Gentoo (M=5092). An ANOVA showed that these differences between species were significant, i.e. there was a significant effect of the species on body mass of the penguins, F(2,330)=341.9, p<.001.
Example 3: Do the number of penguin species vary by year? (categorical and categorical)
# descriptive statistics
peng_table <- table(penguins$species, penguins$year)
# visualization
ggplot(penguins, aes(year, ..count..)) +
geom_bar(aes(fill = species), position = "dodge")

# chi squared test
chisq.test(peng_table)
##
## Pearson's Chi-squared test
##
## data: peng_table
## X-squared = 3.2711, df = 4, p-value = 0.5135