Intro to Stats in R

load libraries

library(palmerpenguins)
library(ggpubr)
library(jmv)
library(rstatix)
library(tidyverse)
library(here)
library(janitor)
library(ggeasy)
library(readxl)
library(gt)
library(gtsummary)

theme_set(theme_bw())

Stats in R using penguins dataset

Example 1a: Are male penguins heavier than female penguins? (categorical and continuous - 2 groups: male and female)

descriptive statistics

penguins <- penguins %>%
  na.omit()

body_mass_summary <- penguins %>%
  group_by(sex) %>%
  summarise(mean = mean(body_mass_g), 
            sd = sd(body_mass_g), 
            n = n(), 
            se = sd/sqrt(n))

body_mass_summary %>%
  gt() %>%
  fmt_number(
    columns = 2:5,
    decimals = 2
  )
sex mean sd n se
female 3,862.27 666.17 165.00 51.86
male 4,545.68 787.63 168.00 60.77

visualization

# bar plot
body_mass_summary %>%
  ggplot(aes(x = sex, y = mean, fill = sex)) +
  geom_col() +
  geom_errorbar(aes(ymin = mean - se, ymax = mean + se),
                width=.2) +
  easy_text_size(15)

# boxplot
penguins %>%
  group_by(sex) %>%
  ggplot(aes(x = sex, y = body_mass_g, fill = sex)) +
  geom_boxplot(alpha = 0.5) +
  geom_jitter(alpha = 0.8, aes(color=sex)) +
  easy_text_size(15) 

statistics: we want to compare means between 2 groups… t test

female <- penguins %>%
  filter(sex == "female")

male <- penguins %>%
  filter(sex == "male")

t.test(female$body_mass_g, male$body_mass_g)
## 
##  Welch Two Sample t-test
## 
## data:  female$body_mass_g and male$body_mass_g
## t = -8.5545, df = 323.9, p-value = 4.794e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -840.5783 -526.2453
## sample estimates:
## mean of x mean of y 
##  3862.273  4545.685
# or we can use jmv package
ttestIS(formula = body_mass_g ~ sex, data = penguins) # independent samples t test. Dependent variable to the left of ~
## 
##  INDEPENDENT SAMPLES T-TEST
## 
##  Independent Samples T-Test                                              
##  ----------------------------------------------------------------------- 
##                                  Statistic      df          p            
##  ----------------------------------------------------------------------- 
##    body_mass_g    Student's t    -8.541720 <U+1D43>    331.0000    < .0000001   
##  ----------------------------------------------------------------------- 
##    <U+1D43> Levene's test is significant (p < .05), suggesting a violation
##    of the assumption of equal variances
  • mean of female=3862 and mean of male=4546
  • t test revealed this is a significant difference. state t value, df, p value

ggpubr: “publication ready”

plot_1a_pub <- ggboxplot(penguins, x = "sex", y = "body_mass_g",
                color = "sex", palette =c("darkblue", "darkred"),
                add = "jitter")

my_comparisons <- list(c("female", "male")) # has to be in a list
plot_1a_pub + 
  stat_compare_means(comparisons = my_comparisons, method="t.test") 

# Add t.test comparisons p-value (if more than 2 groups, you can "anova") 

Example 1b: Do different species have the same or different overall weight? (categorical and continuous - 3 groups)

https://www.datanovia.com/en/lessons/anova-in-r/

descriptive statistics

body_mass_summary <- penguins %>%
  group_by(species) %>%
  summarise(mean = mean(body_mass_g), 
            sd = sd(body_mass_g), 
            n = n(), 
            se = sd/sqrt(n))

body_mass_summary %>%
  gt()
species mean sd n se
Adelie 3706.164 458.6201 146 37.95567
Chinstrap 3733.088 384.3351 68 46.60747
Gentoo 5092.437 501.4762 119 45.97024
penguins %>%
  group_by(species) %>% 
  get_summary_stats(body_mass_g, type = "full") # rstatix
## # A tibble: 3 x 14
##   species  variable     n   min   max median    q1    q3   iqr   mad  mean    sd
##   <fct>    <chr>    <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Adelie   body_ma~   146  2850  4775   3700 3362.  4000  638.  445. 3706.  459.
## 2 Chinstr~ body_ma~    68  2700  4800   3700 3488.  3950  462.  371. 3733.  384.
## 3 Gentoo   body_ma~   119  3950  6300   5050 4700   5500  800   593. 5092.  501.
## # ... with 2 more variables: se <dbl>, ci <dbl>
penguins %>% 
  group_by(species) %>% 
  identify_outliers(body_mass_g)
## # A tibble: 2 x 10
##   species island bill_length_mm bill_depth_mm flipper_length_~ body_mass_g sex  
##   <fct>   <fct>           <dbl>         <dbl>            <int>       <int> <fct>
## 1 Chinst~ Dream            52            20.7              210        4800 male 
## 2 Chinst~ Dream            46.9          16.6              192        2700 fema~
## # ... with 3 more variables: year <int>, is.outlier <lgl>, is.extreme <lgl>

visualization

# identical code as in example 1a but now species rather than sex

plot_1b <- penguins %>%
  na.omit() %>%
  group_by(species) %>%
  ggplot(aes(x = species, y = body_mass_g, fill = species)) +
  geom_boxplot(alpha = 0.5) +
  geom_jitter(alpha = 0.8, aes(color=species)) +
  easy_text_size(15)

statistics: we want to compare means between 3 groups… anova

# we can still do multiple t tests (but should really use a correction if we do this)
my_comparisons <- list(c("Adelie", "Chinstrap"), c("Chinstrap", "Gentoo")) # has to be in a list
plot_1b + 
  stat_compare_means(comparisons = my_comparisons, method="t.test")

# it's better if we do an anova
plot_1b + 
  stat_compare_means(method="anova")

# Compute the analysis of variance
aov_1b <- aov(body_mass_g ~ species, data = penguins)
# Summary of the analysis
summary(aov_1b)
##              Df    Sum Sq  Mean Sq F value Pr(>F)    
## species       2 145190219 72595110   341.9 <2e-16 ***
## Residuals   330  70069447   212332                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# other ANOVA functions
# anovaOneW() # jmv
# anova_test() # rstatix
# welch_anova_test() # rstatix
  • Report ANOVA results We observed differences in body mass between the three species of Adelie (M=3706), Chinstrap (M=3733), and Gentoo (M=5092). An ANOVA showed that these differences between species were significant, i.e. there was a significant effect of the species on body mass of the penguins, F(2,330)=341.9, p<.001.

Example 2: are penguin bills length correlated with flipper length? (continuous and continuous)

descriptive statistics

penguins %>%
  summarise(across(contains("length"), mean)) %>%
  gt()
bill_length_mm flipper_length_mm
43.99279 200.967

visualization

ggplot(penguins, aes(bill_length_mm, flipper_length_mm)) +
  geom_point() +
  geom_smooth(method = "lm") 

statistics: correlation test and/or linear regression

cor(penguins$bill_length_mm, penguins$flipper_length_mm)
## [1] 0.6530956
cor.test(penguins$bill_length_mm, penguins$flipper_length_mm)
## 
##  Pearson's product-moment correlation
## 
## data:  penguins$bill_length_mm and penguins$flipper_length_mm
## t = 15.691, df = 331, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.5868092 0.7106869
## sample estimates:
##       cor 
## 0.6530956
# linear regression
lr <- lm(bill_length_mm ~ flipper_length_mm, data = penguins)
summary(lr)
## 
## Call:
## lm(formula = bill_length_mm ~ flipper_length_mm, data = penguins)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.6367 -2.6981 -0.5788  2.0663 19.0953 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -7.21856    3.27175  -2.206    0.028 *  
## flipper_length_mm  0.25482    0.01624  15.691   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.148 on 331 degrees of freedom
## Multiple R-squared:  0.4265, Adjusted R-squared:  0.4248 
## F-statistic: 246.2 on 1 and 331 DF,  p-value: < 2.2e-16
# R-squared value signifies that your model explains a good proportion of the variability in the dependent variable
  • regression results typically report: F-stat, dfs, p value, R^2

ggpubr: color by species

ggscatter(penguins, x = "bill_length_mm", y = "flipper_length_mm",
          size=1.5,
          add = "reg.line", # Add regression line
          conf.int = TRUE # Add confidence interval
          ) +
  stat_cor(label.x = 18)

Example 3: Do the number of penguin species vary by year? (categorical and categorical)

# descriptive statistics
peng_table <- table(penguins$species, penguins$year)

# visualization
ggplot(penguins, aes(year, ..count..)) +
  geom_bar(aes(fill = species), position = "dodge")

# chi squared test
chisq.test(peng_table)
## 
##  Pearson's Chi-squared test
## 
## data:  peng_table
## X-squared = 3.2711, df = 4, p-value = 0.5135