data diagnose
data <- janitor::clean_names(penguins)
glimpse(data)
## Rows: 344
## Columns: 8
## $ species <fct> Adelie, Adelie, Adelie, Adelie, Adelie, Adelie, Adel…
## $ island <fct> Torgersen, Torgersen, Torgersen, Torgersen, Torgerse…
## $ bill_length_mm <dbl> 39.1, 39.5, 40.3, NA, 36.7, 39.3, 38.9, 39.2, 34.1, …
## $ bill_depth_mm <dbl> 18.7, 17.4, 18.0, NA, 19.3, 20.6, 17.8, 19.6, 18.1, …
## $ flipper_length_mm <int> 181, 186, 195, NA, 193, 190, 181, 195, 193, 190, 186…
## $ body_mass_g <int> 3750, 3800, 3250, NA, 3450, 3650, 3625, 4675, 3475, …
## $ sex <fct> male, female, female, NA, female, male, female, male…
## $ year <int> 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007…
skimr::skim(data)
Data summary
|
|
|
|
Name
|
data
|
|
Number of rows
|
344
|
|
Number of columns
|
8
|
|
_______________________
|
|
|
Column type frequency:
|
|
|
factor
|
3
|
|
numeric
|
5
|
|
________________________
|
|
|
Group variables
|
None
|
Variable type: factor
|
skim_variable
|
n_missing
|
complete_rate
|
ordered
|
n_unique
|
top_counts
|
|
species
|
0
|
1.00
|
FALSE
|
3
|
Ade: 152, Gen: 124, Chi: 68
|
|
island
|
0
|
1.00
|
FALSE
|
3
|
Bis: 168, Dre: 124, Tor: 52
|
|
sex
|
11
|
0.97
|
FALSE
|
2
|
mal: 168, fem: 165
|
Variable type: numeric
|
skim_variable
|
n_missing
|
complete_rate
|
mean
|
sd
|
p0
|
p25
|
p50
|
p75
|
p100
|
hist
|
|
bill_length_mm
|
2
|
0.99
|
43.92
|
5.46
|
32.1
|
39.23
|
44.45
|
48.5
|
59.6
|
▃▇▇▆▁
|
|
bill_depth_mm
|
2
|
0.99
|
17.15
|
1.97
|
13.1
|
15.60
|
17.30
|
18.7
|
21.5
|
▅▅▇▇▂
|
|
flipper_length_mm
|
2
|
0.99
|
200.92
|
14.06
|
172.0
|
190.00
|
197.00
|
213.0
|
231.0
|
▂▇▃▅▂
|
|
body_mass_g
|
2
|
0.99
|
4201.75
|
801.95
|
2700.0
|
3550.00
|
4050.00
|
4750.0
|
6300.0
|
▃▇▆▃▂
|
|
year
|
0
|
1.00
|
2008.03
|
0.82
|
2007.0
|
2007.00
|
2008.00
|
2009.0
|
2009.0
|
▇▁▇▁▇
|
dlookr::diagnose(data)
## # A tibble: 8 × 6
## variables types missing_count missing_percent unique_count unique_rate
## <chr> <chr> <int> <dbl> <int> <dbl>
## 1 species fact… 0 0 3 0.00872
## 2 island fact… 0 0 3 0.00872
## 3 bill_length_mm nume… 2 0.581 165 0.480
## 4 bill_depth_mm nume… 2 0.581 81 0.235
## 5 flipper_length_mm inte… 2 0.581 56 0.163
## 6 body_mass_g inte… 2 0.581 95 0.276
## 7 sex fact… 11 3.20 3 0.00872
## 8 year inte… 0 0 3 0.00872
1. Frequency Table
1) janitor::tabyl
data %>%
tabyl(species
, show_na = T
, show_missing_levels = T) %>%
adorn_pct_formatting(digits = 2) %>% # %
adorn_totals(where = "row") # add a total row
## species n percent
## Adelie 152 44.19%
## Chinstrap 68 19.77%
## Gentoo 124 36.05%
## Total 344 -
2) gtsummary::tbl_summary
data %>%
dplyr::select(species) %>%
tbl_summary(missing = "ifany", #if any missing? show
statistic = list(all_continuous() ~ "{mean} ({sd})", # stats and format for continuous columns
all_categorical() ~ "{n} / {N} ({p}%)"), # stats and format for categorical columns
digits = all_continuous() ~ 1, # rounding for continuous columns
type = all_categorical() ~ "categorical") # force all categorical levels to display
| Characteristic |
N = 344 |
| species |
|
| Adelie |
152 / 344 (44%) |
| Chinstrap |
68 / 344 (20%) |
| Gentoo |
124 / 344 (36%) |
3) dlookr::diagnose_category()
data %>%
diagnose_category(species)
## # A tibble: 3 × 6
## variables levels N freq ratio rank
## <chr> <chr> <int> <int> <dbl> <int>
## 1 species Adelie 344 152 44.2 1
## 2 species Gentoo 344 124 36.0 2
## 3 species Chinstrap 344 68 19.8 3
2. descriptive statistics
1) summarize
data %>%
summarize(MEAN = mean(bill_length_mm, na.rm = T)
, SD = sd(bill_length_mm, na.rm = T)
, SUM = sum(bill_length_mm, na.rm = T)
, MAX = max(bill_length_mm, na.rm = T)
, MIN = min(bill_length_mm, na.rm = T)
, N = n()
, "NA" = sum(is.na(bill_length_mm)))
## # A tibble: 1 × 7
## MEAN SD SUM MAX MIN N `NA`
## <dbl> <dbl> <dbl> <dbl> <dbl> <int> <int>
## 1 43.9 5.46 15021. 59.6 32.1 344 2
by SEX
data %>%
group_by(sex) %>% # by sex
summarize(MEAN = mean(bill_length_mm, na.rm = T)
, SD = sd(bill_length_mm, na.rm = T)
, SUM = sum(bill_length_mm, na.rm = T)
, MAX = max(bill_length_mm, na.rm = T)
, MIN = min(bill_length_mm, na.rm = T)
, N = n()
, "NA" = sum(is.na(bill_length_mm)))
## # A tibble: 3 × 8
## sex MEAN SD SUM MAX MIN N `NA`
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <int>
## 1 female 42.1 4.90 6946 58 32.1 165 0
## 2 male 45.9 5.37 7704. 59.6 34.6 168 0
## 3 <NA> 41.3 4.63 372. 47.3 34.1 11 2
2) rstatix::get_summary_stats
data %>%
get_summary_stats(bill_length_mm)
## # A tibble: 1 × 13
## variable n min max median q1 q3 iqr mad mean sd se
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 bill_lengt… 342 32.1 59.6 44.4 39.2 48.5 9.28 7.04 43.9 5.46 0.295
## # … with 1 more variable: ci <dbl>
data %>%
group_by(sex) %>%
get_summary_stats(bill_length_mm)
## # A tibble: 3 × 14
## sex variable n min max median q1 q3 iqr mad mean sd
## <fct> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 female bill_leng… 165 32.1 58 42.8 37.6 46.2 8.6 5.93 42.1 4.90
## 2 male bill_leng… 168 34.6 59.6 46.8 41.0 50.3 9.35 6.67 45.9 5.37
## 3 <NA> bill_leng… 9 34.1 47.3 42 37.8 44.5 6.7 6.23 41.3 4.63
## # … with 2 more variables: se <dbl>, ci <dbl>
3) dlookr::diagnose_numeric()
group_by() - (X)
data %>%
diagnose_numeric(bill_length_mm)
## # A tibble: 1 × 10
## variables min Q1 mean median Q3 max zero minus outlier
## * <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <int> <int>
## 1 bill_length_mm 32.1 39.2 43.9 44.4 48.5 59.6 0 0 0
3. Crosstab - chi square
1) janitor::tabyl
data %>%
tabyl(species, sex
, show_na = T
, show_missing_levels = T) %>%
adorn_totals(where = "row") %>% # add a total row
adorn_percentages(denominator = "col") %>% # convert counts to proportions
adorn_pct_formatting() %>% # convert proportions to percents
adorn_ns(position = "front") # display as: "count (percent)"
## species female male NA_
## Adelie 73 (44.2%) 73 (43.5%) 6 (54.5%)
## Chinstrap 34 (20.6%) 34 (20.2%) 0 (0.0%)
## Gentoo 58 (35.2%) 61 (36.3%) 5 (45.5%)
## Total 165 (100.0%) 168 (100.0%) 11 (100.0%)
2) gtsummary::tbl_summary
data %>%
dplyr::select(sex, species) %>%
tbl_summary(by = sex,
missing = "ifany", #if any missing? show
statistic = list(all_continuous() ~ "{mean} ({sd})", # stats and format for continuous columns
all_categorical() ~ "{n} / {N} ({p}%)"), # stats and format for categorical columns
digits = all_continuous() ~ 1, # rounding for continuous columns
type = all_categorical() ~ "categorical") %>% # force all categorical levels to display
add_p() # specify what tests to perform
## 11 observations missing `sex` have been removed. To include these observations, use `forcats::fct_explicit_na()` on `sex` column before passing to `tbl_summary()`.
| Characteristic |
female, N = 165 |
male, N = 168 |
p-value |
| species |
|
|
>0.9 |
| Adelie |
73 / 165 (44%) |
73 / 168 (43%) |
|
| Chinstrap |
34 / 165 (21%) |
34 / 168 (20%) |
|
| Gentoo |
58 / 165 (35%) |
61 / 168 (36%) |
|
4. ttest
1) t.test
t.test(bill_length_mm ~ sex, data = data)
##
## Welch Two Sample t-test
##
## data: bill_length_mm by sex
## t = -6.6725, df = 329.29, p-value = 1.066e-10
## alternative hypothesis: true difference in means between group female and group male is not equal to 0
## 95 percent confidence interval:
## -4.865676 -2.649908
## sample estimates:
## mean in group female mean in group male
## 42.09697 45.85476
2) rstatix - t_test
data %>%
t_test(bill_length_mm ~ sex)
## # A tibble: 1 × 10
## .y. group1 group2 n1 n2 statistic df p p.adj p.adj.signif
## * <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 bill… female male 165 168 -6.67 329. 1.07e-10 1.07e-10 ****
3) gtsummary - ttest + chi (multiple variables)
data %>%
dplyr::select(sex, bill_length_mm, bill_depth_mm, flipper_length_mm,
island, species) %>%
tbl_summary(by = sex,
missing = "ifany", #if any missing? show
statistic = list(all_continuous() ~ "{mean} ({sd})", # stats and format for continuous columns
all_categorical() ~ "{n} / {N} ({p}%)"), # stats and format for categorical columns
digits = all_continuous() ~ 1, # rounding for continuous columns
type = all_categorical() ~ "categorical") %>% # force all categorical levels to display
add_p(all_continuous() ~ "t.test") # specify what tests to perform
## 11 observations missing `sex` have been removed. To include these observations, use `forcats::fct_explicit_na()` on `sex` column before passing to `tbl_summary()`.
| Characteristic |
female, N = 165 |
male, N = 168 |
p-value |
| bill_length_mm |
42.1 (4.9) |
45.9 (5.4) |
<0.001 |
| bill_depth_mm |
16.4 (1.8) |
17.9 (1.9) |
<0.001 |
| flipper_length_mm |
197.4 (12.5) |
204.5 (14.5) |
<0.001 |
| island |
|
|
>0.9 |
| Biscoe |
80 / 165 (48%) |
83 / 168 (49%) |
|
| Dream |
61 / 165 (37%) |
62 / 168 (37%) |
|
| Torgersen |
24 / 165 (15%) |
23 / 168 (14%) |
|
| species |
|
|
>0.9 |
| Adelie |
73 / 165 (44%) |
73 / 168 (43%) |
|
| Chinstrap |
34 / 165 (21%) |
34 / 168 (20%) |
|
| Gentoo |
58 / 165 (35%) |
61 / 168 (36%) |
|
5. ANOVA
1) aov
https://statsandr.com/blog/anova-in-r/
anova <- aov(bill_length_mm ~ species, data = data)
summary(anova)
## Df Sum Sq Mean Sq F value Pr(>F)
## species 2 7194 3597 410.6 <2e-16 ***
## Residuals 339 2970 9
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 2 observations deleted due to missingness
TukeyHSD(anova) #pairwise comparison
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = bill_length_mm ~ species, data = data)
##
## $species
## diff lwr upr p adj
## Chinstrap-Adelie 10.042433 9.024859 11.0600064 0.0000000
## Gentoo-Adelie 8.713487 7.867194 9.5597807 0.0000000
## Gentoo-Chinstrap -1.328945 -2.381868 -0.2760231 0.0088993
2) gtsummary - anova + chi (multiple variables)
data %>%
dplyr::select(sex, bill_length_mm, bill_depth_mm, flipper_length_mm,
island, species) %>%
tbl_summary(by = species,
missing = "no",
statistic = list(all_continuous() ~ "{mean} ({sd})", # stats and format for continuous columns
all_categorical() ~ "{n} / {N} ({p}%)"), # stats and format for categorical columns
digits = all_continuous() ~ 1, # rounding for continuous columns
type = all_categorical() ~ "categorical") %>% # force all categorical levels to display
add_p(all_continuous() ~ "aov") # specify what tests to perform
| Characteristic |
Adelie, N = 152 |
Chinstrap, N = 68 |
Gentoo, N = 124 |
p-value |
| sex |
|
|
|
>0.9 |
| female |
73 / 146 (50%) |
34 / 68 (50%) |
58 / 119 (49%) |
|
| male |
73 / 146 (50%) |
34 / 68 (50%) |
61 / 119 (51%) |
|
| bill_length_mm |
38.8 (2.7) |
48.8 (3.3) |
47.5 (3.1) |
<0.001 |
| bill_depth_mm |
18.3 (1.2) |
18.4 (1.1) |
15.0 (1.0) |
<0.001 |
| flipper_length_mm |
190.0 (6.5) |
195.8 (7.1) |
217.2 (6.5) |
<0.001 |
| island |
|
|
|
<0.001 |
| Biscoe |
44 / 152 (29%) |
0 / 68 (0%) |
124 / 124 (100%) |
|
| Dream |
56 / 152 (37%) |
68 / 68 (100%) |
0 / 124 (0%) |
|
| Torgersen |
52 / 152 (34%) |
0 / 68 (0%) |
0 / 124 (0%) |
|