references

library(tidyverse)
library(dlookr)
library(palmerpenguins)
library(janitor)
library(gtsummary)
library(rstatix)
library(multcomp)

data diagnose

data <- janitor::clean_names(penguins)

glimpse(data)

## Rows: 344
## Columns: 8
## $ species           <fct> Adelie, Adelie, Adelie, Adelie, Adelie, Adelie, Adel…
## $ island            <fct> Torgersen, Torgersen, Torgersen, Torgersen, Torgerse…
## $ bill_length_mm    <dbl> 39.1, 39.5, 40.3, NA, 36.7, 39.3, 38.9, 39.2, 34.1, …
## $ bill_depth_mm     <dbl> 18.7, 17.4, 18.0, NA, 19.3, 20.6, 17.8, 19.6, 18.1, …
## $ flipper_length_mm <int> 181, 186, 195, NA, 193, 190, 181, 195, 193, 190, 186…
## $ body_mass_g       <int> 3750, 3800, 3250, NA, 3450, 3650, 3625, 4675, 3475, …
## $ sex               <fct> male, female, female, NA, female, male, female, male…
## $ year              <int> 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007…

skimr::skim(data)

Data summary

Name	data
Number of rows	344
Number of columns	8
_______________________
Column type frequency:
factor	3
numeric	5
________________________
Group variables	None

Variable type: factor

skim_variable	n_missing	complete_rate	ordered	n_unique	top_counts
species	0	1.00	FALSE	3	Ade: 152, Gen: 124, Chi: 68
island	0	1.00	FALSE	3	Bis: 168, Dre: 124, Tor: 52
sex	11	0.97	FALSE	2	mal: 168, fem: 165

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
bill_length_mm	2	0.99	43.92	5.46	32.1	39.23	44.45	48.5	59.6	▃▇▇▆▁
bill_depth_mm	2	0.99	17.15	1.97	13.1	15.60	17.30	18.7	21.5	▅▅▇▇▂
flipper_length_mm	2	0.99	200.92	14.06	172.0	190.00	197.00	213.0	231.0	▂▇▃▅▂
body_mass_g	2	0.99	4201.75	801.95	2700.0	3550.00	4050.00	4750.0	6300.0	▃▇▆▃▂
year	0	1.00	2008.03	0.82	2007.0	2007.00	2008.00	2009.0	2009.0	▇▁▇▁▇

dlookr::diagnose(data)

## # A tibble: 8 × 6
##   variables         types missing_count missing_percent unique_count unique_rate
##   <chr>             <chr>         <int>           <dbl>        <int>       <dbl>
## 1 species           fact…             0           0                3     0.00872
## 2 island            fact…             0           0                3     0.00872
## 3 bill_length_mm    nume…             2           0.581          165     0.480  
## 4 bill_depth_mm     nume…             2           0.581           81     0.235  
## 5 flipper_length_mm inte…             2           0.581           56     0.163  
## 6 body_mass_g       inte…             2           0.581           95     0.276  
## 7 sex               fact…            11           3.20             3     0.00872
## 8 year              inte…             0           0                3     0.00872

1. Frequency Table

1) janitor::tabyl

data %>% 
  tabyl(species
        , show_na = T
        , show_missing_levels = T) %>% 
  adorn_pct_formatting(digits = 2) %>% # %
  adorn_totals(where = "row") # add a total row

##    species   n percent
##     Adelie 152  44.19%
##  Chinstrap  68  19.77%
##     Gentoo 124  36.05%
##      Total 344       -

2) gtsummary::tbl_summary

data %>%
  dplyr::select(species) %>%
    tbl_summary(missing = "ifany", #if any missing? show
                statistic = list(all_continuous() ~ "{mean} ({sd})", # stats and format for continuous columns
                                 all_categorical() ~ "{n} / {N} ({p}%)"), # stats and format for categorical columns
                                 digits = all_continuous() ~ 1, # rounding for continuous columns
                                 type = all_categorical() ~ "categorical") # force all categorical levels to display

Characteristic	N = 344¹
species
Adelie	152 / 344 (44%)
Chinstrap	68 / 344 (20%)
Gentoo	124 / 344 (36%)
¹ n / N (%)

3) dlookr::diagnose_category()

data %>%
  diagnose_category(species)

## # A tibble: 3 × 6
##   variables levels        N  freq ratio  rank
##   <chr>     <chr>     <int> <int> <dbl> <int>
## 1 species   Adelie      344   152  44.2     1
## 2 species   Gentoo      344   124  36.0     2
## 3 species   Chinstrap   344    68  19.8     3

2. descriptive statistics

1) summarize

data %>% 
  summarize(MEAN = mean(bill_length_mm, na.rm = T)
            , SD = sd(bill_length_mm, na.rm = T)
            , SUM = sum(bill_length_mm, na.rm = T)
            , MAX = max(bill_length_mm, na.rm = T)
            , MIN = min(bill_length_mm, na.rm = T)
            , N = n()
            , "NA" = sum(is.na(bill_length_mm)))

## # A tibble: 1 × 7
##    MEAN    SD    SUM   MAX   MIN     N  `NA`
##   <dbl> <dbl>  <dbl> <dbl> <dbl> <int> <int>
## 1  43.9  5.46 15021.  59.6  32.1   344     2

by SEX

data %>%
  group_by(sex) %>% # by sex
  summarize(MEAN = mean(bill_length_mm, na.rm = T)
            , SD = sd(bill_length_mm, na.rm = T)
            , SUM = sum(bill_length_mm, na.rm = T)
            , MAX = max(bill_length_mm, na.rm = T)
            , MIN = min(bill_length_mm, na.rm = T)
            , N = n()
            , "NA" = sum(is.na(bill_length_mm)))

## # A tibble: 3 × 8
##   sex     MEAN    SD   SUM   MAX   MIN     N  `NA`
##   <fct>  <dbl> <dbl> <dbl> <dbl> <dbl> <int> <int>
## 1 female  42.1  4.90 6946   58    32.1   165     0
## 2 male    45.9  5.37 7704.  59.6  34.6   168     0
## 3 <NA>    41.3  4.63  372.  47.3  34.1    11     2

2) rstatix::get_summary_stats

data %>%
  get_summary_stats(bill_length_mm)

## # A tibble: 1 × 13
##   variable        n   min   max median    q1    q3   iqr   mad  mean    sd    se
##   <chr>       <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 bill_lengt…   342  32.1  59.6   44.4  39.2  48.5  9.28  7.04  43.9  5.46 0.295
## # … with 1 more variable: ci <dbl>

data %>%
  group_by(sex) %>%
  get_summary_stats(bill_length_mm)

## # A tibble: 3 × 14
##   sex    variable       n   min   max median    q1    q3   iqr   mad  mean    sd
##   <fct>  <chr>      <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 female bill_leng…   165  32.1  58     42.8  37.6  46.2  8.6   5.93  42.1  4.90
## 2 male   bill_leng…   168  34.6  59.6   46.8  41.0  50.3  9.35  6.67  45.9  5.37
## 3 <NA>   bill_leng…     9  34.1  47.3   42    37.8  44.5  6.7   6.23  41.3  4.63
## # … with 2 more variables: se <dbl>, ci <dbl>

3) dlookr::diagnose_numeric()

group_by() - (X)

data %>%
  diagnose_numeric(bill_length_mm)

## # A tibble: 1 × 10
##   variables        min    Q1  mean median    Q3   max  zero minus outlier
## * <chr>          <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <int> <int>   <int>
## 1 bill_length_mm  32.1  39.2  43.9   44.4  48.5  59.6     0     0       0

3. Crosstab - chi square

1) janitor::tabyl

data %>% 
  tabyl(species, sex
        , show_na = T
        , show_missing_levels = T) %>%
  adorn_totals(where = "row") %>% # add a total row
  adorn_percentages(denominator = "col") %>% # convert counts to proportions
  adorn_pct_formatting() %>% # convert proportions to percents
  adorn_ns(position = "front") # display as: "count (percent)"

##    species       female         male         NA_
##     Adelie  73  (44.2%)  73  (43.5%)  6  (54.5%)
##  Chinstrap  34  (20.6%)  34  (20.2%)  0   (0.0%)
##     Gentoo  58  (35.2%)  61  (36.3%)  5  (45.5%)
##      Total 165 (100.0%) 168 (100.0%) 11 (100.0%)

2) gtsummary::tbl_summary

data %>%
    dplyr::select(sex, species) %>%
    tbl_summary(by = sex,
                missing = "ifany", #if any missing? show
                statistic = list(all_continuous() ~ "{mean} ({sd})", # stats and format for continuous columns
                                 all_categorical() ~ "{n} / {N} ({p}%)"), # stats and format for categorical columns
                                 digits = all_continuous() ~ 1, # rounding for continuous columns
                                 type = all_categorical() ~ "categorical") %>% # force all categorical levels to display
      add_p() # specify what tests to perform

## 11 observations missing `sex` have been removed. To include these observations, use `forcats::fct_explicit_na()` on `sex` column before passing to `tbl_summary()`.

Characteristic	female, N = 165¹	male, N = 168¹	p-value²
species			>0.9
Adelie	73 / 165 (44%)	73 / 168 (43%)
Chinstrap	34 / 165 (21%)	34 / 168 (20%)
Gentoo	58 / 165 (35%)	61 / 168 (36%)
¹ n / N (%)
² Pearson's Chi-squared test

4. ttest

1) t.test

t.test(bill_length_mm ~ sex, data = data)

## 
##  Welch Two Sample t-test
## 
## data:  bill_length_mm by sex
## t = -6.6725, df = 329.29, p-value = 1.066e-10
## alternative hypothesis: true difference in means between group female and group male is not equal to 0
## 95 percent confidence interval:
##  -4.865676 -2.649908
## sample estimates:
## mean in group female   mean in group male 
##             42.09697             45.85476

2) rstatix - t_test

data %>%
  t_test(bill_length_mm ~ sex)

## # A tibble: 1 × 10
##   .y.   group1 group2    n1    n2 statistic    df        p    p.adj p.adj.signif
## * <chr> <chr>  <chr>  <int> <int>     <dbl> <dbl>    <dbl>    <dbl> <chr>       
## 1 bill… female male     165   168     -6.67  329. 1.07e-10 1.07e-10 ****

3) gtsummary - ttest + chi (multiple variables)

data %>%
    dplyr::select(sex, bill_length_mm, bill_depth_mm, flipper_length_mm,
           island, species) %>%
    tbl_summary(by = sex,
                missing = "ifany", #if any missing? show
                statistic = list(all_continuous() ~ "{mean} ({sd})", # stats and format for continuous columns
                                 all_categorical() ~ "{n} / {N} ({p}%)"), # stats and format for categorical columns
                                 digits = all_continuous() ~ 1, # rounding for continuous columns
                                 type = all_categorical() ~ "categorical") %>% # force all categorical levels to display
    add_p(all_continuous() ~ "t.test") # specify what tests to perform

## 11 observations missing `sex` have been removed. To include these observations, use `forcats::fct_explicit_na()` on `sex` column before passing to `tbl_summary()`.

Characteristic	female, N = 165¹	male, N = 168¹	p-value²
bill_length_mm	42.1 (4.9)	45.9 (5.4)	<0.001
bill_depth_mm	16.4 (1.8)	17.9 (1.9)	<0.001
flipper_length_mm	197.4 (12.5)	204.5 (14.5)	<0.001
island			>0.9
Biscoe	80 / 165 (48%)	83 / 168 (49%)
Dream	61 / 165 (37%)	62 / 168 (37%)
Torgersen	24 / 165 (15%)	23 / 168 (14%)
species			>0.9
Adelie	73 / 165 (44%)	73 / 168 (43%)
Chinstrap	34 / 165 (21%)	34 / 168 (20%)
Gentoo	58 / 165 (35%)	61 / 168 (36%)
¹ Mean (SD); n / N (%)
² Welch Two Sample t-test; Pearson's Chi-squared test

5. ANOVA

1) aov

https://statsandr.com/blog/anova-in-r/

anova <- aov(bill_length_mm ~ species, data = data)
summary(anova)

##              Df Sum Sq Mean Sq F value Pr(>F)    
## species       2   7194    3597   410.6 <2e-16 ***
## Residuals   339   2970       9                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 2 observations deleted due to missingness

TukeyHSD(anova) #pairwise comparison

##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = bill_length_mm ~ species, data = data)
## 
## $species
##                       diff       lwr        upr     p adj
## Chinstrap-Adelie 10.042433  9.024859 11.0600064 0.0000000
## Gentoo-Adelie     8.713487  7.867194  9.5597807 0.0000000
## Gentoo-Chinstrap -1.328945 -2.381868 -0.2760231 0.0088993

2) gtsummary - anova + chi (multiple variables)

data %>%
    dplyr::select(sex, bill_length_mm, bill_depth_mm, flipper_length_mm,
           island, species) %>%
    tbl_summary(by = species,
                missing = "no",
                statistic = list(all_continuous() ~ "{mean} ({sd})", # stats and format for continuous columns
                all_categorical() ~ "{n} / {N} ({p}%)"), # stats and format for categorical columns
                digits = all_continuous() ~ 1, # rounding for continuous columns
                type = all_categorical() ~ "categorical") %>% # force all categorical levels to display
    add_p(all_continuous() ~ "aov") # specify what tests to perform

Characteristic	Adelie, N = 152¹	Chinstrap, N = 68¹	Gentoo, N = 124¹	p-value²
sex				>0.9
female	73 / 146 (50%)	34 / 68 (50%)	58 / 119 (49%)
male	73 / 146 (50%)	34 / 68 (50%)	61 / 119 (51%)
bill_length_mm	38.8 (2.7)	48.8 (3.3)	47.5 (3.1)	<0.001
bill_depth_mm	18.3 (1.2)	18.4 (1.1)	15.0 (1.0)	<0.001
flipper_length_mm	190.0 (6.5)	195.8 (7.1)	217.2 (6.5)	<0.001
island				<0.001
Biscoe	44 / 152 (29%)	0 / 68 (0%)	124 / 124 (100%)
Dream	56 / 152 (37%)	68 / 68 (100%)	0 / 124 (0%)
Torgersen	52 / 152 (34%)	0 / 68 (0%)	0 / 124 (0%)
¹ n / N (%); Mean (SD)
² Pearson's Chi-squared test; One-way ANOVA

Basic statistical test: frequency, chi, t, anova

Heeyoung

2022-10-17

references

general

general

general

gtsummary

janitor

janitor

dlookr

data diagnose

1. Frequency Table

1) janitor::tabyl

2) gtsummary::tbl_summary

3) dlookr::diagnose_category()

2. descriptive statistics

1) summarize

by SEX

2) rstatix::get_summary_stats

3) dlookr::diagnose_numeric()

group_by() - (X)

3. Crosstab - chi square

1) janitor::tabyl

2) gtsummary::tbl_summary

4. ttest

1) t.test

2) rstatix - t_test

3) gtsummary - ttest + chi (multiple variables)

5. ANOVA

1) aov

2) gtsummary - anova + chi (multiple variables)