references

dlookr

library(tidyverse)
library(dlookr)
library(palmerpenguins)
library(janitor)
library(gtsummary)
library(rstatix)
library(multcomp)



data diagnose

data <- janitor::clean_names(penguins)
glimpse(data)
## Rows: 344
## Columns: 8
## $ species           <fct> Adelie, Adelie, Adelie, Adelie, Adelie, Adelie, Adel…
## $ island            <fct> Torgersen, Torgersen, Torgersen, Torgersen, Torgerse…
## $ bill_length_mm    <dbl> 39.1, 39.5, 40.3, NA, 36.7, 39.3, 38.9, 39.2, 34.1, …
## $ bill_depth_mm     <dbl> 18.7, 17.4, 18.0, NA, 19.3, 20.6, 17.8, 19.6, 18.1, …
## $ flipper_length_mm <int> 181, 186, 195, NA, 193, 190, 181, 195, 193, 190, 186…
## $ body_mass_g       <int> 3750, 3800, 3250, NA, 3450, 3650, 3625, 4675, 3475, …
## $ sex               <fct> male, female, female, NA, female, male, female, male…
## $ year              <int> 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007…
skimr::skim(data)
Data summary
Name data
Number of rows 344
Number of columns 8
_______________________
Column type frequency:
factor 3
numeric 5
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
species 0 1.00 FALSE 3 Ade: 152, Gen: 124, Chi: 68
island 0 1.00 FALSE 3 Bis: 168, Dre: 124, Tor: 52
sex 11 0.97 FALSE 2 mal: 168, fem: 165

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
bill_length_mm 2 0.99 43.92 5.46 32.1 39.23 44.45 48.5 59.6 ▃▇▇▆▁
bill_depth_mm 2 0.99 17.15 1.97 13.1 15.60 17.30 18.7 21.5 ▅▅▇▇▂
flipper_length_mm 2 0.99 200.92 14.06 172.0 190.00 197.00 213.0 231.0 ▂▇▃▅▂
body_mass_g 2 0.99 4201.75 801.95 2700.0 3550.00 4050.00 4750.0 6300.0 ▃▇▆▃▂
year 0 1.00 2008.03 0.82 2007.0 2007.00 2008.00 2009.0 2009.0 ▇▁▇▁▇
dlookr::diagnose(data)
## # A tibble: 8 × 6
##   variables         types missing_count missing_percent unique_count unique_rate
##   <chr>             <chr>         <int>           <dbl>        <int>       <dbl>
## 1 species           fact…             0           0                3     0.00872
## 2 island            fact…             0           0                3     0.00872
## 3 bill_length_mm    nume…             2           0.581          165     0.480  
## 4 bill_depth_mm     nume…             2           0.581           81     0.235  
## 5 flipper_length_mm inte…             2           0.581           56     0.163  
## 6 body_mass_g       inte…             2           0.581           95     0.276  
## 7 sex               fact…            11           3.20             3     0.00872
## 8 year              inte…             0           0                3     0.00872



1. Frequency Table

1) janitor::tabyl

data %>% 
  tabyl(species
        , show_na = T
        , show_missing_levels = T) %>% 
  adorn_pct_formatting(digits = 2) %>% # %
  adorn_totals(where = "row") # add a total row
##    species   n percent
##     Adelie 152  44.19%
##  Chinstrap  68  19.77%
##     Gentoo 124  36.05%
##      Total 344       -

2) gtsummary::tbl_summary

data %>%
  dplyr::select(species) %>%
    tbl_summary(missing = "ifany", #if any missing? show
                statistic = list(all_continuous() ~ "{mean} ({sd})", # stats and format for continuous columns
                                 all_categorical() ~ "{n} / {N} ({p}%)"), # stats and format for categorical columns
                                 digits = all_continuous() ~ 1, # rounding for continuous columns
                                 type = all_categorical() ~ "categorical") # force all categorical levels to display
Characteristic N = 3441
species
    Adelie 152 / 344 (44%)
    Chinstrap 68 / 344 (20%)
    Gentoo 124 / 344 (36%)
1 n / N (%)

3) dlookr::diagnose_category()

data %>%
  diagnose_category(species)
## # A tibble: 3 × 6
##   variables levels        N  freq ratio  rank
##   <chr>     <chr>     <int> <int> <dbl> <int>
## 1 species   Adelie      344   152  44.2     1
## 2 species   Gentoo      344   124  36.0     2
## 3 species   Chinstrap   344    68  19.8     3



2. descriptive statistics

1) summarize

data %>% 
  summarize(MEAN = mean(bill_length_mm, na.rm = T)
            , SD = sd(bill_length_mm, na.rm = T)
            , SUM = sum(bill_length_mm, na.rm = T)
            , MAX = max(bill_length_mm, na.rm = T)
            , MIN = min(bill_length_mm, na.rm = T)
            , N = n()
            , "NA" = sum(is.na(bill_length_mm)))
## # A tibble: 1 × 7
##    MEAN    SD    SUM   MAX   MIN     N  `NA`
##   <dbl> <dbl>  <dbl> <dbl> <dbl> <int> <int>
## 1  43.9  5.46 15021.  59.6  32.1   344     2

by SEX

data %>%
  group_by(sex) %>% # by sex
  summarize(MEAN = mean(bill_length_mm, na.rm = T)
            , SD = sd(bill_length_mm, na.rm = T)
            , SUM = sum(bill_length_mm, na.rm = T)
            , MAX = max(bill_length_mm, na.rm = T)
            , MIN = min(bill_length_mm, na.rm = T)
            , N = n()
            , "NA" = sum(is.na(bill_length_mm)))
## # A tibble: 3 × 8
##   sex     MEAN    SD   SUM   MAX   MIN     N  `NA`
##   <fct>  <dbl> <dbl> <dbl> <dbl> <dbl> <int> <int>
## 1 female  42.1  4.90 6946   58    32.1   165     0
## 2 male    45.9  5.37 7704.  59.6  34.6   168     0
## 3 <NA>    41.3  4.63  372.  47.3  34.1    11     2

2) rstatix::get_summary_stats

data %>%
  get_summary_stats(bill_length_mm)
## # A tibble: 1 × 13
##   variable        n   min   max median    q1    q3   iqr   mad  mean    sd    se
##   <chr>       <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 bill_lengt…   342  32.1  59.6   44.4  39.2  48.5  9.28  7.04  43.9  5.46 0.295
## # … with 1 more variable: ci <dbl>
data %>%
  group_by(sex) %>%
  get_summary_stats(bill_length_mm)
## # A tibble: 3 × 14
##   sex    variable       n   min   max median    q1    q3   iqr   mad  mean    sd
##   <fct>  <chr>      <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 female bill_leng…   165  32.1  58     42.8  37.6  46.2  8.6   5.93  42.1  4.90
## 2 male   bill_leng…   168  34.6  59.6   46.8  41.0  50.3  9.35  6.67  45.9  5.37
## 3 <NA>   bill_leng…     9  34.1  47.3   42    37.8  44.5  6.7   6.23  41.3  4.63
## # … with 2 more variables: se <dbl>, ci <dbl>

3) dlookr::diagnose_numeric()

group_by() - (X)

data %>%
  diagnose_numeric(bill_length_mm)
## # A tibble: 1 × 10
##   variables        min    Q1  mean median    Q3   max  zero minus outlier
## * <chr>          <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <int> <int>   <int>
## 1 bill_length_mm  32.1  39.2  43.9   44.4  48.5  59.6     0     0       0



3. Crosstab - chi square

1) janitor::tabyl

data %>% 
  tabyl(species, sex
        , show_na = T
        , show_missing_levels = T) %>%
  adorn_totals(where = "row") %>% # add a total row
  adorn_percentages(denominator = "col") %>% # convert counts to proportions
  adorn_pct_formatting() %>% # convert proportions to percents
  adorn_ns(position = "front") # display as: "count (percent)"
##    species       female         male         NA_
##     Adelie  73  (44.2%)  73  (43.5%)  6  (54.5%)
##  Chinstrap  34  (20.6%)  34  (20.2%)  0   (0.0%)
##     Gentoo  58  (35.2%)  61  (36.3%)  5  (45.5%)
##      Total 165 (100.0%) 168 (100.0%) 11 (100.0%)

2) gtsummary::tbl_summary

data %>%
    dplyr::select(sex, species) %>%
    tbl_summary(by = sex,
                missing = "ifany", #if any missing? show
                statistic = list(all_continuous() ~ "{mean} ({sd})", # stats and format for continuous columns
                                 all_categorical() ~ "{n} / {N} ({p}%)"), # stats and format for categorical columns
                                 digits = all_continuous() ~ 1, # rounding for continuous columns
                                 type = all_categorical() ~ "categorical") %>% # force all categorical levels to display
      add_p() # specify what tests to perform
## 11 observations missing `sex` have been removed. To include these observations, use `forcats::fct_explicit_na()` on `sex` column before passing to `tbl_summary()`.
Characteristic female, N = 1651 male, N = 1681 p-value2
species >0.9
    Adelie 73 / 165 (44%) 73 / 168 (43%)
    Chinstrap 34 / 165 (21%) 34 / 168 (20%)
    Gentoo 58 / 165 (35%) 61 / 168 (36%)
1 n / N (%)
2 Pearson's Chi-squared test



4. ttest

1) t.test

t.test(bill_length_mm ~ sex, data = data)
## 
##  Welch Two Sample t-test
## 
## data:  bill_length_mm by sex
## t = -6.6725, df = 329.29, p-value = 1.066e-10
## alternative hypothesis: true difference in means between group female and group male is not equal to 0
## 95 percent confidence interval:
##  -4.865676 -2.649908
## sample estimates:
## mean in group female   mean in group male 
##             42.09697             45.85476

2) rstatix - t_test

data %>%
  t_test(bill_length_mm ~ sex)
## # A tibble: 1 × 10
##   .y.   group1 group2    n1    n2 statistic    df        p    p.adj p.adj.signif
## * <chr> <chr>  <chr>  <int> <int>     <dbl> <dbl>    <dbl>    <dbl> <chr>       
## 1 bill… female male     165   168     -6.67  329. 1.07e-10 1.07e-10 ****

3) gtsummary - ttest + chi (multiple variables)

data %>%
    dplyr::select(sex, bill_length_mm, bill_depth_mm, flipper_length_mm,
           island, species) %>%
    tbl_summary(by = sex,
                missing = "ifany", #if any missing? show
                statistic = list(all_continuous() ~ "{mean} ({sd})", # stats and format for continuous columns
                                 all_categorical() ~ "{n} / {N} ({p}%)"), # stats and format for categorical columns
                                 digits = all_continuous() ~ 1, # rounding for continuous columns
                                 type = all_categorical() ~ "categorical") %>% # force all categorical levels to display
    add_p(all_continuous() ~ "t.test") # specify what tests to perform
## 11 observations missing `sex` have been removed. To include these observations, use `forcats::fct_explicit_na()` on `sex` column before passing to `tbl_summary()`.
Characteristic female, N = 1651 male, N = 1681 p-value2
bill_length_mm 42.1 (4.9) 45.9 (5.4) <0.001
bill_depth_mm 16.4 (1.8) 17.9 (1.9) <0.001
flipper_length_mm 197.4 (12.5) 204.5 (14.5) <0.001
island >0.9
    Biscoe 80 / 165 (48%) 83 / 168 (49%)
    Dream 61 / 165 (37%) 62 / 168 (37%)
    Torgersen 24 / 165 (15%) 23 / 168 (14%)
species >0.9
    Adelie 73 / 165 (44%) 73 / 168 (43%)
    Chinstrap 34 / 165 (21%) 34 / 168 (20%)
    Gentoo 58 / 165 (35%) 61 / 168 (36%)
1 Mean (SD); n / N (%)
2 Welch Two Sample t-test; Pearson's Chi-squared test



5. ANOVA

1) aov

https://statsandr.com/blog/anova-in-r/

anova <- aov(bill_length_mm ~ species, data = data)
summary(anova)
##              Df Sum Sq Mean Sq F value Pr(>F)    
## species       2   7194    3597   410.6 <2e-16 ***
## Residuals   339   2970       9                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 2 observations deleted due to missingness
TukeyHSD(anova) #pairwise comparison
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = bill_length_mm ~ species, data = data)
## 
## $species
##                       diff       lwr        upr     p adj
## Chinstrap-Adelie 10.042433  9.024859 11.0600064 0.0000000
## Gentoo-Adelie     8.713487  7.867194  9.5597807 0.0000000
## Gentoo-Chinstrap -1.328945 -2.381868 -0.2760231 0.0088993

2) gtsummary - anova + chi (multiple variables)

data %>%
    dplyr::select(sex, bill_length_mm, bill_depth_mm, flipper_length_mm,
           island, species) %>%
    tbl_summary(by = species,
                missing = "no",
                statistic = list(all_continuous() ~ "{mean} ({sd})", # stats and format for continuous columns
                all_categorical() ~ "{n} / {N} ({p}%)"), # stats and format for categorical columns
                digits = all_continuous() ~ 1, # rounding for continuous columns
                type = all_categorical() ~ "categorical") %>% # force all categorical levels to display
    add_p(all_continuous() ~ "aov") # specify what tests to perform
Characteristic Adelie, N = 1521 Chinstrap, N = 681 Gentoo, N = 1241 p-value2
sex >0.9
    female 73 / 146 (50%) 34 / 68 (50%) 58 / 119 (49%)
    male 73 / 146 (50%) 34 / 68 (50%) 61 / 119 (51%)
bill_length_mm 38.8 (2.7) 48.8 (3.3) 47.5 (3.1) <0.001
bill_depth_mm 18.3 (1.2) 18.4 (1.1) 15.0 (1.0) <0.001
flipper_length_mm 190.0 (6.5) 195.8 (7.1) 217.2 (6.5) <0.001
island <0.001
    Biscoe 44 / 152 (29%) 0 / 68 (0%) 124 / 124 (100%)
    Dream 56 / 152 (37%) 68 / 68 (100%) 0 / 124 (0%)
    Torgersen 52 / 152 (34%) 0 / 68 (0%) 0 / 124 (0%)
1 n / N (%); Mean (SD)
2 Pearson's Chi-squared test; One-way ANOVA